1 /* 2 * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 * SPDX-License-Identifier: MIT 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be included in 13 * all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24 /*! 25 * @file 26 * @brief GPU Fabric Probe handling 27 */ 28 29 30 #include "gpu/gpu.h" 31 #include "os/os.h" 32 #include "utils/nvprintf.h" 33 #include "kernel/gpu/nvlink/kernel_nvlink.h" 34 #include "gpu/gpu_fabric_probe.h" 35 #include "nvlink_inband_msg.h" 36 #include "kernel/mem_mgr/fabric_vaspace.h" 37 #include "ctrl/ctrl2080/ctrl2080internal.h" 38 39 // Structure to hold gpu probe information 40 typedef struct GPU_FABRIC_PROBE_INFO_KERNEL 41 { 42 NvBool bProbeRespRcvd; 43 NvU8 bwMode; 44 45 OBJGPU *pGpu; 46 47 nvlink_inband_gpu_probe_rsp_msg_t probeResponseMsg; 48 49 } GPU_FABRIC_PROBE_INFO_KERNEL; 50 51 static NV_STATUS 52 _gpuFabricProbeFullSanityCheck 53 ( 54 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 55 ) 56 { 57 if (pGpuFabricProbeInfoKernel == NULL) 58 { 59 return NV_ERR_NOT_SUPPORTED; 60 } 61 62 LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner( 63 gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu))); 64 65 if (!gpuFabricProbeIsReceived(pGpuFabricProbeInfoKernel)) 66 { 67 return NV_ERR_NOT_READY; 68 } 69 70 if (!gpuFabricProbeIsSuccess(pGpuFabricProbeInfoKernel)) 71 { 72 return pGpuFabricProbeInfoKernel->probeResponseMsg.msgHdr.status; 73 } 74 75 return NV_OK; 76 } 77 78 NV_STATUS 79 gpuFabricProbeGetGpuFabricHandle 80 ( 81 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 82 NvU64 *pHandle 83 ) 84 { 85 NV_STATUS status; 86 87 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 88 89 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 90 91 *pHandle = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpuHandle; 92 93 return status; 94 } 95 96 NV_STATUS 97 gpuFabricProbeGetGfId 98 ( 99 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 100 NvU32 *pGfId 101 ) 102 { 103 NV_STATUS status; 104 105 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 106 107 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 108 109 *pGfId = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gfId; 110 111 return status; 112 } 113 114 NV_STATUS 115 gpuFabricProbeGetfmCaps 116 ( 117 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 118 NvU64 *pFmCaps 119 ) 120 { 121 NV_STATUS status; 122 123 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 124 125 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 126 127 *pFmCaps = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fmCaps; 128 129 return status; 130 } 131 132 NV_STATUS 133 gpuFabricProbeGetClusterUuid 134 ( 135 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 136 NvUuid *pClusterUuid 137 ) 138 { 139 NV_STATUS status; 140 141 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 142 143 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 144 145 portMemCopy(&pClusterUuid->uuid[0], 146 sizeof(pClusterUuid->uuid), 147 &pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.clusterUuid.uuid[0], 148 sizeof(pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.clusterUuid.uuid)); 149 150 return status; 151 } 152 153 NV_STATUS 154 gpuFabricProbeGetFabricPartitionId 155 ( 156 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 157 NvU16 *pFabricPartitionId 158 ) 159 { 160 NV_STATUS status; 161 162 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 163 164 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 165 166 *pFabricPartitionId = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fabricPartitionId; 167 168 return status; 169 } 170 171 NV_STATUS 172 gpuFabricProbeGetGpaAddress 173 ( 174 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 175 NvU64 *pGpaAddress 176 ) 177 { 178 NV_STATUS status; 179 180 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 181 182 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 183 184 *pGpaAddress = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpaAddress; 185 186 return status; 187 } 188 189 NV_STATUS 190 gpuFabricProbeGetGpaAddressRange 191 ( 192 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 193 NvU64 *pGpaAddressRange 194 ) 195 { 196 NV_STATUS status; 197 198 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 199 200 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 201 202 *pGpaAddressRange = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.gpaAddressRange; 203 204 return status; 205 } 206 207 NV_STATUS 208 gpuFabricProbeGetFlaAddress 209 ( 210 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 211 NvU64 *pFlaAddress 212 ) 213 { 214 NV_STATUS status; 215 216 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 217 218 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 219 220 *pFlaAddress = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.flaAddress; 221 222 return status; 223 } 224 225 NV_STATUS 226 gpuFabricProbeGetFlaAddressRange 227 ( 228 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 229 NvU64 *pFlaAddressRange 230 ) 231 { 232 NV_STATUS status; 233 234 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 235 236 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 237 238 *pFlaAddressRange = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.flaAddressRange; 239 240 return status; 241 } 242 243 NV_STATUS 244 gpuFabricProbeGetNumProbeReqs 245 ( 246 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 247 NvU64 *numProbes 248 ) 249 { 250 NV2080_CTRL_CMD_INTERNAL_GET_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 }; 251 RM_API *pRmApi; 252 OBJGPU *pGpu; 253 254 if (pGpuFabricProbeInfoKernel == NULL) 255 { 256 return NV_ERR_NOT_SUPPORTED; 257 } 258 259 pGpu = pGpuFabricProbeInfoKernel->pGpu; 260 pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 261 262 LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner( 263 gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu))); 264 265 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 266 pRmApi->Control(pRmApi, 267 pGpu->hInternalClient, 268 pGpu->hInternalSubdevice, 269 NV2080_CTRL_CMD_INTERNAL_GPU_GET_FABRIC_PROBE_INFO, 270 ¶ms, 271 sizeof(params))); 272 273 *numProbes = params.numProbes; 274 275 return NV_OK; 276 } 277 278 NvBool 279 gpuFabricProbeIsReceived 280 ( 281 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 282 ) 283 { 284 if (pGpuFabricProbeInfoKernel == NULL) 285 { 286 return NV_FALSE; 287 } 288 289 LOCK_ASSERT_AND_RETURN_BOOL(rmDeviceGpuLockIsOwner( 290 gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu)), 291 NV_FALSE); 292 293 return pGpuFabricProbeInfoKernel->bProbeRespRcvd; 294 } 295 296 NvBool 297 gpuFabricProbeIsSuccess 298 ( 299 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 300 ) 301 { 302 nvlink_inband_gpu_probe_rsp_msg_t *pProbeResponseMsg; 303 nvlink_inband_msg_header_t *pProbeRespMsgHdr; 304 305 if (pGpuFabricProbeInfoKernel == NULL) 306 { 307 return NV_FALSE; 308 } 309 310 LOCK_ASSERT_AND_RETURN_BOOL(rmDeviceGpuLockIsOwner(gpuGetInstance( 311 pGpuFabricProbeInfoKernel->pGpu)), 312 NV_FALSE); 313 314 pProbeResponseMsg = &pGpuFabricProbeInfoKernel->probeResponseMsg; 315 pProbeRespMsgHdr = &pProbeResponseMsg->msgHdr; 316 317 return pProbeRespMsgHdr->status == NV_OK; 318 } 319 320 NV_STATUS 321 gpuFabricProbeGetFmStatus 322 ( 323 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 324 ) 325 { 326 if (pGpuFabricProbeInfoKernel == NULL) 327 { 328 return NV_ERR_NOT_SUPPORTED; 329 } 330 331 LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner( 332 gpuGetInstance(pGpuFabricProbeInfoKernel->pGpu))); 333 334 return pGpuFabricProbeInfoKernel->probeResponseMsg.msgHdr.status; 335 } 336 337 static void 338 _gpuFabricProbeSetupGpaRange 339 ( 340 OBJGPU *pGpu, 341 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 342 ) 343 { 344 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 345 // setup GPA based system 346 if (pKernelNvlink != NULL) 347 { 348 NvU64 gpaAddress; 349 NvU64 gpaAddressSize; 350 351 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 352 gpuFabricProbeGetGpaAddress(pGpuFabricProbeInfoKernel, 353 &gpaAddress) == NV_OK); 354 355 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 356 gpuFabricProbeGetGpaAddressRange(pGpuFabricProbeInfoKernel, 357 &gpaAddressSize) == NV_OK); 358 359 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 360 knvlinkSetUniqueFabricBaseAddress_HAL(pGpu, pKernelNvlink, 361 gpaAddress) == NV_OK); 362 } 363 } 364 365 static void 366 _gpuFabricProbeSetupFlaRange 367 ( 368 OBJGPU *pGpu, 369 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 370 ) 371 { 372 if (pGpu->pFabricVAS != NULL) 373 { 374 NvU64 flaBaseAddress; 375 NvU64 flaSize; 376 377 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 378 gpuFabricProbeGetFlaAddress(pGpuFabricProbeInfoKernel, 379 &flaBaseAddress) == NV_OK); 380 381 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 382 gpuFabricProbeGetFlaAddressRange(pGpuFabricProbeInfoKernel, 383 &flaSize) == NV_OK); 384 385 NV_CHECK_OR_RETURN_VOID(LEVEL_ERROR, 386 fabricvaspaceInitUCRange(dynamicCast(pGpu->pFabricVAS, FABRIC_VASPACE), 387 pGpu, flaBaseAddress, flaSize) == NV_OK); 388 } 389 } 390 391 static NV_STATUS 392 _gpuFabricProbeReceiveKernel 393 ( 394 NvU32 gpuInstance, 395 NV2080_CTRL_NVLINK_INBAND_RECEIVED_DATA_PARAMS *pInbandRcvParams 396 ) 397 { 398 OBJGPU *pGpu; 399 NvU32 gpuMaskUnused; 400 nvlink_inband_gpu_probe_rsp_msg_t *pProbeRespMsg; 401 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel; 402 NV_STATUS status; 403 404 if ((pGpu = gpumgrGetGpu(gpuInstance)) == NULL) 405 { 406 NV_ASSERT_FAILED("Invalid GPU instance"); 407 return NV_ERR_INVALID_ARGUMENT; 408 } 409 410 // 411 // There is a scenario where _gpuFabricProbeStart fails in the GSP 412 // and returns failure to kernel ctrl call to start probe. 413 // This will set the pGpuFabricProbeInfoKernel to NULL. 414 // GSP also sends a probe response with failure error code. 415 // Handling this response causes kernel driver to crash since 416 // pGpuFabricProbeInfoKernel is already cleared in the kernel. 417 // This check is added to handle this scenario. 418 // 419 NV_CHECK_OR_RETURN(LEVEL_ERROR, pGpu->pGpuFabricProbeInfoKernel != NULL, NV_OK); 420 421 NV_ASSERT(rmGpuGroupLockIsOwner(gpuInstance, GPU_LOCK_GRP_SUBDEVICE, 422 &gpuMaskUnused)); 423 424 NV_ASSERT(pInbandRcvParams != NULL); 425 426 pGpuFabricProbeInfoKernel = pGpu->pGpuFabricProbeInfoKernel; 427 428 pProbeRespMsg = \ 429 (nvlink_inband_gpu_probe_rsp_msg_t *)&pInbandRcvParams->data[0]; 430 431 portMemCopy(&pGpuFabricProbeInfoKernel->probeResponseMsg, 432 sizeof(pGpuFabricProbeInfoKernel->probeResponseMsg), 433 pProbeRespMsg, 434 sizeof(*pProbeRespMsg)); 435 436 // 437 // TODO - Add additional check with versioning to continue with the 438 // timer and send lower version requests 439 // 440 pGpuFabricProbeInfoKernel->bProbeRespRcvd = NV_TRUE; 441 442 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 443 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 444 445 _gpuFabricProbeSetupGpaRange(pGpu, pGpuFabricProbeInfoKernel); 446 _gpuFabricProbeSetupFlaRange(pGpu, pGpuFabricProbeInfoKernel); 447 448 return NV_OK; 449 } 450 451 void 452 gpuFabricProbeSuspend 453 ( 454 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 455 ) 456 { 457 OBJGPU *pGpu; 458 RM_API *pRmApi; 459 NV_STATUS status; 460 461 if (pGpuFabricProbeInfoKernel == NULL) 462 { 463 return; 464 } 465 466 pGpu = pGpuFabricProbeInfoKernel->pGpu; 467 pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 468 469 NV_ASSERT(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu))); 470 471 NV_CHECK_OK(status, LEVEL_ERROR, 472 pRmApi->Control(pRmApi, 473 pGpu->hInternalClient, 474 pGpu->hInternalSubdevice, 475 NV2080_CTRL_CMD_INTERNAL_GPU_SUSPEND_FABRIC_PROBE, 476 NULL, 0)); 477 } 478 479 NV_STATUS 480 gpuFabricProbeResume 481 ( 482 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 483 ) 484 { 485 OBJGPU *pGpu; 486 RM_API *pRmApi; 487 NV2080_CTRL_CMD_INTERNAL_RESUME_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 }; 488 489 if (pGpuFabricProbeInfoKernel == NULL) 490 { 491 return NV_ERR_NOT_SUPPORTED; 492 } 493 494 pGpu = pGpuFabricProbeInfoKernel->pGpu; 495 pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 496 497 NV_ASSERT(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu))); 498 499 params.bwMode = pGpuFabricProbeInfoKernel->bwMode; 500 501 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 502 pRmApi->Control(pRmApi, 503 pGpu->hInternalClient, 504 pGpu->hInternalSubdevice, 505 NV2080_CTRL_CMD_INTERNAL_GPU_RESUME_FABRIC_PROBE, 506 ¶ms, sizeof(params))); 507 508 return NV_OK; 509 } 510 511 NV_STATUS 512 gpuFabricProbeStart 513 ( 514 OBJGPU *pGpu, 515 GPU_FABRIC_PROBE_INFO_KERNEL **ppGpuFabricProbeInfoKernel 516 ) 517 { 518 NV_STATUS status; 519 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel; 520 NVLINK_INBAND_MSG_CALLBACK inbandMsgCbParams; 521 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 522 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 523 NV2080_CTRL_CMD_INTERNAL_START_GPU_FABRIC_PROBE_INFO_PARAMS params = { 0 }; 524 525 LOCK_ASSERT_AND_RETURN(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu))); 526 527 // Check if NVSwitch based system. If not return without doing anything 528 if (!gpuFabricProbeIsSupported(pGpu)) 529 { 530 return NV_OK; 531 } 532 533 *ppGpuFabricProbeInfoKernel = portMemAllocNonPaged(sizeof(*pGpuFabricProbeInfoKernel)); 534 NV_ASSERT_OR_RETURN(*ppGpuFabricProbeInfoKernel != NULL, NV_ERR_NO_MEMORY); 535 536 pGpuFabricProbeInfoKernel = *ppGpuFabricProbeInfoKernel; 537 538 portMemSet(pGpuFabricProbeInfoKernel, 0, sizeof(*pGpuFabricProbeInfoKernel)); 539 540 pGpuFabricProbeInfoKernel->pGpu = pGpu; 541 542 // Register the receive callback 543 inbandMsgCbParams.messageType = NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP; 544 inbandMsgCbParams.pCallback = _gpuFabricProbeReceiveKernel; 545 inbandMsgCbParams.wqItemFlags = (OS_QUEUE_WORKITEM_FLAGS_LOCK_SEMA | 546 OS_QUEUE_WORKITEM_FLAGS_LOCK_GPU_GROUP_SUBDEVICE_RW); 547 status = knvlinkRegisterInbandCallback(pGpu, 548 pKernelNvlink, 549 &inbandMsgCbParams); 550 if (status != NV_OK) 551 { 552 NV_PRINTF(LEVEL_ERROR, "GPU%u Registering Inband Cb failed\n", 553 gpuGetInstance(pGpu)); 554 goto fail; 555 } 556 557 pGpuFabricProbeInfoKernel->bwMode = gpumgrGetGpuNvlinkBwMode(); 558 params.bwMode = pGpuFabricProbeInfoKernel->bwMode; 559 560 // Send IOCTL to start probe 561 NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, 562 pRmApi->Control(pRmApi, 563 pGpu->hInternalClient, 564 pGpu->hInternalSubdevice, 565 NV2080_CTRL_CMD_INTERNAL_GPU_START_FABRIC_PROBE, 566 ¶ms, sizeof(params)), 567 fail); 568 569 return NV_OK; 570 571 fail: 572 portMemFree(pGpuFabricProbeInfoKernel); 573 pGpu->pGpuFabricProbeInfoKernel = NULL; 574 575 return status; 576 } 577 578 void 579 gpuFabricProbeStop 580 ( 581 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel 582 ) 583 { 584 OBJGPU *pGpu; 585 KernelNvlink *pKernelNvlink; 586 RM_API *pRmApi; 587 588 if (pGpuFabricProbeInfoKernel == NULL) 589 { 590 return; 591 } 592 593 pGpu = pGpuFabricProbeInfoKernel->pGpu; 594 595 NV_ASSERT_OR_RETURN_VOID(rmDeviceGpuLockIsOwner(gpuGetInstance(pGpu))); 596 597 pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 598 NV_ASSERT(pKernelNvlink != NULL); 599 600 // Unregister the receive callback 601 NV_ASSERT_OK(knvlinkUnregisterInbandCallback(pGpu, pKernelNvlink, 602 NVLINK_INBAND_MSG_TYPE_GPU_PROBE_RSP)); 603 604 pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 605 606 NV_ASSERT_OK(pRmApi->Control(pRmApi, 607 pGpu->hInternalClient, 608 pGpu->hInternalSubdevice, 609 NV2080_CTRL_CMD_INTERNAL_GPU_STOP_FABRIC_PROBE, 610 NULL, 0)); 611 612 portMemFree(pGpuFabricProbeInfoKernel); 613 pGpu->pGpuFabricProbeInfoKernel = NULL; 614 } 615 616 NvBool 617 gpuFabricProbeIsSupported 618 ( 619 OBJGPU *pGpu 620 ) 621 { 622 if (pGpu->fabricProbeRetryDelay == 0) 623 { 624 NV_PRINTF(LEVEL_INFO, "GPU%u Probe handling is disabled\n", 625 gpuGetInstance(pGpu)); 626 return NV_FALSE; 627 } 628 629 if (GPU_GET_KERNEL_NVLINK(pGpu) == NULL) 630 { 631 return NV_FALSE; 632 } 633 634 return NV_TRUE; 635 } 636 637 static void 638 _gpuFabricProbeInvalidate 639 ( 640 OBJGPU *pGpu 641 ) 642 { 643 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel = 644 pGpu->pGpuFabricProbeInfoKernel; 645 KernelNvlink *pKernelNvlink = GPU_GET_KERNEL_NVLINK(pGpu); 646 FABRIC_VASPACE *pFabricVAS = dynamicCast(pGpu->pFabricVAS, FABRIC_VASPACE); 647 648 pGpuFabricProbeInfoKernel->bProbeRespRcvd = NV_FALSE; 649 650 if (pKernelNvlink != NULL) 651 knvlinkClearUniqueFabricBaseAddress_HAL(pGpu, pKernelNvlink); 652 653 if (pFabricVAS != NULL) 654 fabricvaspaceClearUCRange(pFabricVAS); 655 } 656 657 #define GPU_FABRIC_CHECK_BW_MODE(fmCaps, mode) \ 658 do \ 659 { \ 660 if ((fmCaps & NVLINK_INBAND_FM_CAPS_BW_MODE_##mode) == 0) \ 661 return NV_ERR_NOT_SUPPORTED; \ 662 } while (0) 663 664 static NV_STATUS 665 _gpuFabricProbeUpdateBwMode 666 ( 667 OBJGPU *pGpu, 668 NvU8 mode 669 ) 670 { 671 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel = 672 pGpu->pGpuFabricProbeInfoKernel; 673 RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu); 674 675 pGpuFabricProbeInfoKernel->bwMode = mode; 676 677 gpuFabricProbeSuspend(pGpuFabricProbeInfoKernel); 678 679 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, 680 pRmApi->Control(pRmApi, 681 pGpu->hInternalClient, 682 pGpu->hInternalSubdevice, 683 NV2080_CTRL_CMD_INTERNAL_GPU_INVALIDATE_FABRIC_PROBE, 684 NULL, 0)); 685 686 _gpuFabricProbeInvalidate(pGpu); 687 688 NV_CHECK_OK_OR_RETURN(LEVEL_ERROR, gpuFabricProbeResume(pGpuFabricProbeInfoKernel)); 689 690 return NV_OK; 691 } 692 693 NV_STATUS 694 gpuFabricProbeSetBwMode 695 ( 696 NvU8 mode 697 ) 698 { 699 NvU32 attachedGpuCount; 700 NvU32 attachedGpuMask; 701 NV_STATUS status; 702 NvU32 gpuIndex; 703 OBJGPU *pGpu; 704 705 status = gpumgrGetGpuAttachInfo(&attachedGpuCount, &attachedGpuMask); 706 if (status != NV_OK) 707 { 708 return NV_ERR_INVALID_STATE; 709 } 710 711 // Check if all GPUs belong to NvSwitch 712 gpuIndex = 0; 713 for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex); 714 pGpu != NULL; 715 pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex)) 716 { 717 if (!gpuFabricProbeIsSupported(pGpu)) 718 { 719 // For directed connected system 720 return NV_OK; 721 } 722 } 723 724 // 725 // Check if all GPUs received fabric probe and 726 // if the mode is supported on all GPUs. 727 // 728 gpuIndex = 0; 729 for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex); 730 pGpu != NULL; 731 pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex)) 732 { 733 if (!gpuFabricProbeIsReceived(pGpu->pGpuFabricProbeInfoKernel) || 734 !gpuFabricProbeIsSuccess(pGpu->pGpuFabricProbeInfoKernel)) 735 { 736 return NV_ERR_NOT_READY; 737 } 738 739 NvU64 fmCaps = pGpu->pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.fmCaps; 740 switch(mode) 741 { 742 case GPU_NVLINK_BW_MODE_MIN: 743 GPU_FABRIC_CHECK_BW_MODE(fmCaps, MIN); 744 break; 745 case GPU_NVLINK_BW_MODE_HALF: 746 GPU_FABRIC_CHECK_BW_MODE(fmCaps, HALF); 747 break; 748 case GPU_NVLINK_BW_MODE_3QUARTER: 749 GPU_FABRIC_CHECK_BW_MODE(fmCaps, 3QUARTER); 750 break; 751 case GPU_NVLINK_BW_MODE_OFF: 752 return NV_OK; // Don't need to ask FM 753 default: 754 break; 755 } 756 } 757 758 gpuIndex = 0; 759 for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex); 760 pGpu != NULL; 761 pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex)) 762 { 763 FABRIC_VASPACE *pFabricVAS = dynamicCast(pGpu->pFabricVAS, 764 FABRIC_VASPACE); 765 if (pFabricVAS == NULL) 766 { 767 continue; 768 } 769 770 if (fabricvaspaceIsInUse(pFabricVAS)) 771 { 772 return NV_ERR_STATE_IN_USE; 773 } 774 } 775 776 gpuIndex = 0; 777 for(pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex); 778 pGpu != NULL; 779 pGpu = gpumgrGetNextGpu(attachedGpuMask, &gpuIndex)) 780 { 781 status = _gpuFabricProbeUpdateBwMode(pGpu, mode); 782 if (status != NV_OK) 783 { 784 return status; 785 } 786 } 787 788 return NV_OK; 789 } 790 791 NV_STATUS 792 gpuFabricProbeGetlinkMaskToBeReduced 793 ( 794 GPU_FABRIC_PROBE_INFO_KERNEL *pGpuFabricProbeInfoKernel, 795 NvU32 *linkMaskToBeReduced 796 ) 797 { 798 NV_STATUS status; 799 800 status = _gpuFabricProbeFullSanityCheck(pGpuFabricProbeInfoKernel); 801 NV_CHECK_OR_RETURN(LEVEL_ERROR, status == NV_OK, status); 802 803 *linkMaskToBeReduced = pGpuFabricProbeInfoKernel->probeResponseMsg.probeRsp.linkMaskToBeReduced; 804 return NV_OK; 805 } 806