/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "nv_uvm_interface.h"
#include "uvm_api.h"
#include "uvm_channel.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_gpu_semaphore.h"
#include "uvm_hal.h"
#include "uvm_procfs.h"
#include "uvm_pmm_gpu.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_va_space.h"
#include "uvm_user_channel.h"
#include "uvm_perf_events.h"
#include "uvm_perf_heuristics.h"
#include "uvm_common.h"
#include "ctrl2080mc.h"
#include "nv-kthread-q.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_ats.h"
#include "uvm_test.h"
#include "uvm_conf_computing.h"

#include "uvm_linux.h"

#define UVM_PROC_GPUS_PEER_DIR_NAME "peers"

// The uvm_peer_copy module parameter selects between "phys" and "virt". It
// determines the addressing mode used for P2P copies.
#define UVM_PARAM_PEER_COPY_VIRTUAL  "virt"
#define UVM_PARAM_PEER_COPY_PHYSICAL "phys"
static char *uvm_peer_copy = UVM_PARAM_PEER_COPY_PHYSICAL;
module_param(uvm_peer_copy, charp, S_IRUGO);
MODULE_PARM_DESC(uvm_peer_copy, "Choose the addressing mode for peer copying, options: "
                 UVM_PARAM_PEER_COPY_PHYSICAL " [default] or " UVM_PARAM_PEER_COPY_VIRTUAL ". "
                 "Valid for Ampere+ GPUs.");

static void remove_gpu(uvm_gpu_t *gpu);
static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu);
static void destroy_nvlink_peers(uvm_gpu_t *gpu);

static uvm_user_channel_t *get_user_channel(uvm_rb_tree_node_t *node)
{
    return container_of(node, uvm_user_channel_t, instance_ptr.node);
}

static uvm_gpu_link_type_t get_gpu_link_type(UVM_LINK_TYPE link_type)
{
    switch (link_type) {
        case UVM_LINK_TYPE_PCIE:
            return UVM_GPU_LINK_PCIE;
        case UVM_LINK_TYPE_NVLINK_1:
            return UVM_GPU_LINK_NVLINK_1;
        case UVM_LINK_TYPE_NVLINK_2:
            return UVM_GPU_LINK_NVLINK_2;
        case UVM_LINK_TYPE_NVLINK_3:
            return UVM_GPU_LINK_NVLINK_3;
        case UVM_LINK_TYPE_NVLINK_4:
            return UVM_GPU_LINK_NVLINK_4;
        case UVM_LINK_TYPE_C2C:
            return UVM_GPU_LINK_C2C;
        default:
            return UVM_GPU_LINK_INVALID;
    }
}

static void fill_gpu_info(uvm_parent_gpu_t *parent_gpu, const UvmGpuInfo *gpu_info)
{
    char uuid_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH];

    parent_gpu->rm_info = *gpu_info;

    parent_gpu->system_bus.link = get_gpu_link_type(gpu_info->sysmemLink);
    UVM_ASSERT(parent_gpu->system_bus.link != UVM_GPU_LINK_INVALID);

    parent_gpu->system_bus.link_rate_mbyte_per_s = gpu_info->sysmemLinkRateMBps;

    if (gpu_info->systemMemoryWindowSize > 0) {
        // memory_window_end is inclusive but uvm_gpu_is_coherent() checks
        // memory_window_end > memory_window_start as its condition.
        UVM_ASSERT(gpu_info->systemMemoryWindowSize > 1);
        parent_gpu->system_bus.memory_window_start = gpu_info->systemMemoryWindowStart;
        parent_gpu->system_bus.memory_window_end = gpu_info->systemMemoryWindowStart +
                                                   gpu_info->systemMemoryWindowSize - 1;
    }

    parent_gpu->nvswitch_info.is_nvswitch_connected = gpu_info->connectedToSwitch;

    // nvswitch is routed via physical pages, where the upper 13 bits of the
    // 47-bit address space hold the routing information for each peer.
    // Currently, this is limited to a 16GB framebuffer window size.
115 if (parent_gpu->nvswitch_info.is_nvswitch_connected) 116 parent_gpu->nvswitch_info.fabric_memory_window_start = gpu_info->nvswitchMemoryWindowStart; 117 118 format_uuid_to_buffer(uuid_buffer, sizeof(uuid_buffer), &parent_gpu->uuid); 119 snprintf(parent_gpu->name, 120 sizeof(parent_gpu->name), 121 "ID %u: %s: %s", 122 uvm_id_value(parent_gpu->id), 123 parent_gpu->rm_info.name, 124 uuid_buffer); 125 } 126 127 static NV_STATUS get_gpu_caps(uvm_gpu_t *gpu) 128 { 129 NV_STATUS status; 130 UvmGpuCaps gpu_caps; 131 132 memset(&gpu_caps, 0, sizeof(gpu_caps)); 133 134 status = uvm_rm_locked_call(nvUvmInterfaceQueryCaps(uvm_gpu_device_handle(gpu), &gpu_caps)); 135 if (status != NV_OK) 136 return status; 137 138 if (gpu_caps.numaEnabled) { 139 UVM_ASSERT(uvm_gpu_is_coherent(gpu->parent)); 140 gpu->mem_info.numa.enabled = true; 141 gpu->mem_info.numa.node_id = gpu_caps.numaNodeId; 142 } 143 else { 144 UVM_ASSERT(!uvm_gpu_is_coherent(gpu->parent)); 145 } 146 147 return NV_OK; 148 } 149 150 static NV_STATUS alloc_and_init_address_space(uvm_gpu_t *gpu) 151 { 152 NV_STATUS status; 153 UvmGpuAddressSpaceInfo gpu_address_space_info = {0}; 154 155 status = uvm_rm_locked_call(nvUvmInterfaceAddressSpaceCreate(uvm_gpu_device_handle(gpu), 156 gpu->parent->rm_va_base, 157 gpu->parent->rm_va_size, 158 &gpu->rm_address_space, 159 &gpu_address_space_info)); 160 if (status != NV_OK) 161 return status; 162 163 gpu->big_page.internal_size = gpu_address_space_info.bigPageSize; 164 165 gpu->time.time0_register = gpu_address_space_info.time0Offset; 166 gpu->time.time1_register = gpu_address_space_info.time1Offset; 167 168 gpu->max_subcontexts = gpu_address_space_info.maxSubctxCount; 169 170 return NV_OK; 171 } 172 173 static NV_STATUS get_gpu_fb_info(uvm_gpu_t *gpu) 174 { 175 NV_STATUS status; 176 UvmGpuFbInfo fb_info = {0}; 177 178 status = uvm_rm_locked_call(nvUvmInterfaceGetFbInfo(uvm_gpu_device_handle(gpu), &fb_info)); 179 if (status != NV_OK) 180 return status; 181 182 if (!fb_info.bZeroFb) { 183 gpu->mem_info.size = ((NvU64)fb_info.heapSize + fb_info.reservedHeapSize) * 1024; 184 gpu->mem_info.max_allocatable_address = fb_info.maxAllocatableAddress; 185 } 186 187 return NV_OK; 188 } 189 190 static NV_STATUS get_gpu_ecc_info(uvm_gpu_t *gpu) 191 { 192 NV_STATUS status; 193 UvmGpuEccInfo ecc_info = {0}; 194 195 status = uvm_rm_locked_call(nvUvmInterfaceGetEccInfo(uvm_gpu_device_handle(gpu), &ecc_info)); 196 if (status != NV_OK) 197 return status; 198 199 gpu->ecc.enabled = ecc_info.bEccEnabled; 200 if (gpu->ecc.enabled) { 201 gpu->ecc.hw_interrupt_tree_location = (volatile NvU32*)((char*)ecc_info.eccReadLocation + ecc_info.eccOffset); 202 UVM_ASSERT(gpu->ecc.hw_interrupt_tree_location != NULL); 203 204 gpu->ecc.mask = ecc_info.eccMask; 205 UVM_ASSERT(gpu->ecc.mask != 0); 206 207 gpu->ecc.error_notifier = ecc_info.eccErrorNotifier; 208 UVM_ASSERT(gpu->ecc.error_notifier != NULL); 209 } 210 211 return NV_OK; 212 } 213 214 static bool gpu_supports_uvm(uvm_parent_gpu_t *parent_gpu) 215 { 216 // TODO: Bug 1757136: Add Linux SLI support. Until then, explicitly disable 217 // UVM on SLI. 218 return parent_gpu->rm_info.subdeviceCount == 1; 219 } 220 221 static bool platform_uses_canonical_form_address(void) 222 { 223 if (NVCPU_IS_PPC64LE) 224 return false; 225 226 return true; 227 } 228 229 bool uvm_gpu_can_address(uvm_gpu_t *gpu, NvU64 addr, NvU64 size) 230 { 231 // Lower and upper address spaces are typically found in platforms that use 232 // the canonical address form. 
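    // A worked example with illustrative numbers: if the effective addr_shift
    // computed below ends up being 48, then
    //   max_va_lower = 1ULL << 47               = 0x0000800000000000
    //   min_va_upper = sign-extension of bit 47 = 0xFFFF800000000000
    // and [addr, addr_end] is considered addressable only if it lies entirely
    // below max_va_lower or entirely at or above min_va_upper.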
    NvU64 max_va_lower;
    NvU64 addr_end = addr + size - 1;
    NvU8 gpu_addr_shift;
    NvU8 cpu_addr_shift;
    NvU8 addr_shift;

    // Watch out for calling this too early in init
    UVM_ASSERT(gpu->address_space_tree.hal);
    UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64);
    UVM_ASSERT(addr <= addr_end);
    UVM_ASSERT(size > 0);

    gpu_addr_shift = gpu->address_space_tree.hal->num_va_bits();
    cpu_addr_shift = fls64(TASK_SIZE - 1) + 1;
    addr_shift = gpu_addr_shift;

    // Pascal+ GPUs are capable of accessing kernel pointers in various modes
    // by applying the same upper-bit checks that x86, ARM, and Power
    // processors do. x86 and ARM use canonical form addresses. For ARM, even
    // with Top-Byte Ignore enabled, the following logic validates addresses
    // from the kernel VA range. PowerPC does not use canonical form addresses.
    // The following diagram illustrates the valid (V) VA regions that can be
    // mapped (or addressed) by the GPU/CPU when the CPU uses canonical form.
    // (C) regions are only accessible by the CPU. Similarly, (G) regions
    // are only accessible by the GPU. (X) regions are not addressable.
    // Note that we only consider (V) regions, i.e., address ranges that are
    // addressable by both the CPU and the GPU.
    //
    //        GPU MAX VA < CPU MAX VA          GPU MAX VA >= CPU MAX VA
    //           0xF..F+----------------+           0xF..F+----------------+
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    // GPU MIN UPPER VA|----------------| CPU MIN UPPER VA|----------------|
    //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
    //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
    // CPU MIN UPPER VA|----------------| GPU MIN UPPER VA|----------------|
    //                 |XXXXXXXXXXXXXXXX|                 |XXXXXXXXXXXXXXXX|
    //                 |XXXXXXXXXXXXXXXX|                 |XXXXXXXXXXXXXXXX|
    // CPU MAX LOWER VA|----------------| GPU MAX LOWER VA|----------------|
    //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
    //                 |CCCCCCCCCCCCCCCC|                 |GGGGGGGGGGGGGGGG|
    // GPU MAX LOWER VA|----------------| CPU MAX LOWER VA|----------------|
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    //                 |VVVVVVVVVVVVVVVV|                 |VVVVVVVVVVVVVVVV|
    //                0+----------------+                0+----------------+

    // On canonical form address platforms and Pascal+ GPUs.
    if (platform_uses_canonical_form_address() && gpu_addr_shift > 40) {
        NvU64 min_va_upper;

        // On x86, when cpu_addr_shift > gpu_addr_shift, it means the CPU uses
        // 5-level paging and the GPU is pre-Hopper. On Pascal-Ada GPUs (49b
        // wide VA) we set addr_shift to match a 4-level paging x86 (48b wide).
        // See uvm_parent_gpu_canonical_address() for more details.
        if (cpu_addr_shift > gpu_addr_shift)
            addr_shift = NVCPU_IS_X86_64 ? 48 : 49;
        else if (gpu_addr_shift == 57)
            addr_shift = gpu_addr_shift;
        else
            addr_shift = cpu_addr_shift;

        min_va_upper = (NvU64)((NvS64)(1ULL << 63) >> (64 - addr_shift));
        max_va_lower = 1ULL << (addr_shift - 1);
        return (addr_end < max_va_lower) || (addr >= min_va_upper);
    }
    else {
        max_va_lower = 1ULL << addr_shift;
        return addr_end < max_va_lower;
    }
}

// The internal UVM VAS does not use canonical form addresses.
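// Consequently, uvm_gpu_can_address_kernel() below only checks addresses
// against the GPU's full VA width (1ULL << num_va_bits); there is no split
// into lower and upper canonical halves for internal allocations.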
307 bool uvm_gpu_can_address_kernel(uvm_gpu_t *gpu, NvU64 addr, NvU64 size) 308 { 309 NvU64 addr_end = addr + size - 1; 310 NvU64 max_gpu_va; 311 312 // Watch out for calling this too early in init 313 UVM_ASSERT(gpu->address_space_tree.hal); 314 UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64); 315 UVM_ASSERT(addr <= addr_end); 316 UVM_ASSERT(size > 0); 317 318 max_gpu_va = 1ULL << gpu->address_space_tree.hal->num_va_bits(); 319 return addr_end < max_gpu_va; 320 } 321 322 NvU64 uvm_parent_gpu_canonical_address(uvm_parent_gpu_t *parent_gpu, NvU64 addr) 323 { 324 NvU8 gpu_addr_shift; 325 NvU8 cpu_addr_shift; 326 NvU8 addr_shift; 327 NvU64 input_addr = addr; 328 329 if (platform_uses_canonical_form_address()) { 330 // When the CPU VA width is larger than GPU's, it means that: 331 // On ARM: the CPU is on LVA mode and the GPU is pre-Hopper. 332 // On x86: the CPU uses 5-level paging and the GPU is pre-Hopper. 333 // We sign-extend on the 48b on ARM and on the 47b on x86 to mirror the 334 // behavior of CPUs with smaller (than GPU) VA widths. 335 gpu_addr_shift = parent_gpu->arch_hal->mmu_mode_hal(UVM_PAGE_SIZE_64K)->num_va_bits(); 336 cpu_addr_shift = fls64(TASK_SIZE - 1) + 1; 337 338 if (cpu_addr_shift > gpu_addr_shift) 339 addr_shift = NVCPU_IS_X86_64 ? 48 : 49; 340 else if (gpu_addr_shift == 57) 341 addr_shift = gpu_addr_shift; 342 else 343 addr_shift = cpu_addr_shift; 344 345 addr = (NvU64)((NvS64)(addr << (64 - addr_shift)) >> (64 - addr_shift)); 346 347 // This protection acts on when the address is not covered by the GPU's 348 // OOR_ADDR_CHECK. This can only happen when OOR_ADDR_CHECK is in 349 // permissive (NO_CHECK) mode. 350 if ((addr << (64 - gpu_addr_shift)) != (input_addr << (64 - gpu_addr_shift))) 351 return input_addr; 352 } 353 354 return addr; 355 } 356 357 static void gpu_info_print_ce_caps(uvm_gpu_t *gpu, struct seq_file *s) 358 { 359 NvU32 i; 360 UvmGpuCopyEnginesCaps *ces_caps; 361 NV_STATUS status; 362 363 ces_caps = uvm_kvmalloc_zero(sizeof(*ces_caps)); 364 if (!ces_caps) { 365 UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (no memory)\n"); 366 return; 367 } 368 369 status = uvm_rm_locked_call(nvUvmInterfaceQueryCopyEnginesCaps(uvm_gpu_device_handle(gpu), ces_caps)); 370 if (status != NV_OK) { 371 UVM_SEQ_OR_DBG_PRINT(s, "supported_ces: unavailable (query failed)\n"); 372 goto out; 373 } 374 375 UVM_SEQ_OR_DBG_PRINT(s, "supported_ces:\n"); 376 for (i = 0; i < UVM_COPY_ENGINE_COUNT_MAX; ++i) { 377 UvmGpuCopyEngineCaps *ce_caps = ces_caps->copyEngineCaps + i; 378 379 if (!ce_caps->supported) 380 continue; 381 382 UVM_SEQ_OR_DBG_PRINT(s, " ce %u pce mask 0x%08x grce %u shared %u sysmem read %u sysmem write %u sysmem %u " 383 "nvlink p2p %u p2p %u\n", 384 i, 385 ce_caps->cePceMask, 386 ce_caps->grce, 387 ce_caps->shared, 388 ce_caps->sysmemRead, 389 ce_caps->sysmemWrite, 390 ce_caps->sysmem, 391 ce_caps->nvlinkP2p, 392 ce_caps->p2p); 393 } 394 395 out: 396 uvm_kvfree(ces_caps); 397 } 398 399 static const char *uvm_gpu_virt_type_string(UVM_VIRT_MODE virtMode) 400 { 401 BUILD_BUG_ON(UVM_VIRT_MODE_COUNT != 4); 402 403 switch (virtMode) { 404 UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_NONE); 405 UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_LEGACY); 406 UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_SRIOV_HEAVY); 407 UVM_ENUM_STRING_CASE(UVM_VIRT_MODE_SRIOV_STANDARD); 408 UVM_ENUM_STRING_DEFAULT(); 409 } 410 } 411 412 static const char *uvm_gpu_link_type_string(uvm_gpu_link_type_t link_type) 413 { 414 BUILD_BUG_ON(UVM_GPU_LINK_MAX != 7); 415 416 switch (link_type) { 417 
UVM_ENUM_STRING_CASE(UVM_GPU_LINK_INVALID); 418 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_PCIE); 419 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_1); 420 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_2); 421 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_3); 422 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_NVLINK_4); 423 UVM_ENUM_STRING_CASE(UVM_GPU_LINK_C2C); 424 UVM_ENUM_STRING_DEFAULT(); 425 } 426 } 427 428 static void gpu_info_print_common(uvm_gpu_t *gpu, struct seq_file *s) 429 { 430 const UvmGpuInfo *gpu_info = &gpu->parent->rm_info; 431 NvU64 num_pages_in; 432 NvU64 num_pages_out; 433 NvU64 mapped_cpu_pages_size; 434 NvU32 get, put; 435 unsigned int cpu; 436 437 UVM_SEQ_OR_DBG_PRINT(s, "GPU %s\n", uvm_gpu_name(gpu)); 438 UVM_SEQ_OR_DBG_PRINT(s, "retained_count %llu\n", uvm_gpu_retained_count(gpu)); 439 UVM_SEQ_OR_DBG_PRINT(s, "ecc %s\n", gpu->ecc.enabled ? "enabled" : "disabled"); 440 if (gpu->parent->closest_cpu_numa_node == -1) 441 UVM_SEQ_OR_DBG_PRINT(s, "closest_cpu_numa_node n/a\n"); 442 else 443 UVM_SEQ_OR_DBG_PRINT(s, "closest_cpu_numa_node %d\n", gpu->parent->closest_cpu_numa_node); 444 445 if (!uvm_procfs_is_debug_enabled()) 446 return; 447 448 UVM_SEQ_OR_DBG_PRINT(s, "CPU link type %s\n", 449 uvm_gpu_link_type_string(gpu->parent->system_bus.link)); 450 UVM_SEQ_OR_DBG_PRINT(s, "CPU link bandwidth %uMBps\n", 451 gpu->parent->system_bus.link_rate_mbyte_per_s); 452 453 UVM_SEQ_OR_DBG_PRINT(s, "architecture 0x%X\n", gpu_info->gpuArch); 454 UVM_SEQ_OR_DBG_PRINT(s, "implementation 0x%X\n", gpu_info->gpuImplementation); 455 UVM_SEQ_OR_DBG_PRINT(s, "gpcs %u\n", gpu_info->gpcCount); 456 UVM_SEQ_OR_DBG_PRINT(s, "max_gpcs %u\n", gpu_info->maxGpcCount); 457 UVM_SEQ_OR_DBG_PRINT(s, "tpcs %u\n", gpu_info->tpcCount); 458 UVM_SEQ_OR_DBG_PRINT(s, "max_tpcs_per_gpc %u\n", gpu_info->maxTpcPerGpcCount); 459 UVM_SEQ_OR_DBG_PRINT(s, "host_class 0x%X\n", gpu_info->hostClass); 460 UVM_SEQ_OR_DBG_PRINT(s, "ce_class 0x%X\n", gpu_info->ceClass); 461 UVM_SEQ_OR_DBG_PRINT(s, "virtualization_mode %s\n", 462 uvm_gpu_virt_type_string(gpu_info->virtMode)); 463 UVM_SEQ_OR_DBG_PRINT(s, "big_page_size %u\n", gpu->big_page.internal_size); 464 UVM_SEQ_OR_DBG_PRINT(s, "rm_va_base 0x%llx\n", gpu->parent->rm_va_base); 465 UVM_SEQ_OR_DBG_PRINT(s, "rm_va_size 0x%llx\n", gpu->parent->rm_va_size); 466 UVM_SEQ_OR_DBG_PRINT(s, "vidmem_size %llu (%llu MBs)\n", 467 gpu->mem_info.size, 468 gpu->mem_info.size / (1024 * 1024)); 469 UVM_SEQ_OR_DBG_PRINT(s, "vidmem_max_allocatable 0x%llx (%llu MBs)\n", 470 gpu->mem_info.max_allocatable_address, 471 gpu->mem_info.max_allocatable_address / (1024 * 1024)); 472 473 if (gpu->mem_info.numa.enabled) { 474 NvU64 window_size = gpu->parent->system_bus.memory_window_end - gpu->parent->system_bus.memory_window_start + 1; 475 UVM_SEQ_OR_DBG_PRINT(s, "numa_node_id %u\n", uvm_gpu_numa_node(gpu)); 476 UVM_SEQ_OR_DBG_PRINT(s, "memory_window_start 0x%llx\n", 477 gpu->parent->system_bus.memory_window_start); 478 UVM_SEQ_OR_DBG_PRINT(s, "memory_window_end 0x%llx\n", 479 gpu->parent->system_bus.memory_window_end); 480 UVM_SEQ_OR_DBG_PRINT(s, "system_memory_window_size 0x%llx (%llu MBs)\n", 481 window_size, 482 window_size / (1024 * 1024)); 483 } 484 485 if (gpu->parent->npu) 486 UVM_SEQ_OR_DBG_PRINT(s, "npu_domain %d\n", gpu->parent->npu->pci_domain); 487 488 UVM_SEQ_OR_DBG_PRINT(s, "interrupts %llu\n", gpu->parent->isr.interrupt_count); 489 490 if (gpu->parent->isr.replayable_faults.handling) { 491 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_bh %llu\n", 492 gpu->parent->isr.replayable_faults.stats.bottom_half_count); 493 
UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_bh/cpu\n"); 494 for_each_cpu(cpu, &gpu->parent->isr.replayable_faults.stats.cpus_used_mask) { 495 UVM_SEQ_OR_DBG_PRINT(s, " cpu%02u %llu\n", 496 cpu, 497 gpu->parent->isr.replayable_faults.stats.cpu_exec_count[cpu]); 498 } 499 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_buffer_entries %u\n", 500 gpu->parent->fault_buffer_info.replayable.max_faults); 501 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_cached_get %u\n", 502 gpu->parent->fault_buffer_info.replayable.cached_get); 503 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_cached_put %u\n", 504 gpu->parent->fault_buffer_info.replayable.cached_put); 505 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_get %u\n", 506 gpu->parent->fault_buffer_hal->read_get(gpu->parent)); 507 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_put %u\n", 508 gpu->parent->fault_buffer_hal->read_put(gpu->parent)); 509 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_fault_batch_size %u\n", 510 gpu->parent->fault_buffer_info.max_batch_size); 511 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_replay_policy %s\n", 512 uvm_perf_fault_replay_policy_string(gpu->parent->fault_buffer_info.replayable.replay_policy)); 513 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults_num_faults %llu\n", 514 gpu->parent->stats.num_replayable_faults); 515 } 516 if (gpu->parent->isr.non_replayable_faults.handling) { 517 UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_bh %llu\n", 518 gpu->parent->isr.non_replayable_faults.stats.bottom_half_count); 519 UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_bh/cpu\n"); 520 for_each_cpu(cpu, &gpu->parent->isr.non_replayable_faults.stats.cpus_used_mask) { 521 UVM_SEQ_OR_DBG_PRINT(s, " cpu%02u %llu\n", 522 cpu, 523 gpu->parent->isr.non_replayable_faults.stats.cpu_exec_count[cpu]); 524 } 525 UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_buffer_entries %u\n", 526 gpu->parent->fault_buffer_info.non_replayable.max_faults); 527 UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults_num_faults %llu\n", 528 gpu->parent->stats.num_non_replayable_faults); 529 } 530 531 if (gpu->parent->isr.access_counters.handling_ref_count > 0) { 532 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_bh %llu\n", 533 gpu->parent->isr.access_counters.stats.bottom_half_count); 534 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_bh/cpu\n"); 535 for_each_cpu(cpu, &gpu->parent->isr.access_counters.stats.cpus_used_mask) { 536 UVM_SEQ_OR_DBG_PRINT(s, " cpu%02u %llu\n", 537 cpu, 538 gpu->parent->isr.access_counters.stats.cpu_exec_count[cpu]); 539 } 540 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_buffer_entries %u\n", 541 gpu->parent->access_counter_buffer_info.max_notifications); 542 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_cached_get %u\n", 543 gpu->parent->access_counter_buffer_info.cached_get); 544 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_cached_put %u\n", 545 gpu->parent->access_counter_buffer_info.cached_put); 546 547 get = UVM_GPU_READ_ONCE(*gpu->parent->access_counter_buffer_info.rm_info.pAccessCntrBufferGet); 548 put = UVM_GPU_READ_ONCE(*gpu->parent->access_counter_buffer_info.rm_info.pAccessCntrBufferPut); 549 550 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_get %u\n", get); 551 UVM_SEQ_OR_DBG_PRINT(s, "access_counters_put %u\n", put); 552 } 553 554 num_pages_out = atomic64_read(&gpu->parent->stats.num_pages_out); 555 num_pages_in = atomic64_read(&gpu->parent->stats.num_pages_in); 556 mapped_cpu_pages_size = atomic64_read(&gpu->parent->mapped_cpu_pages_size); 557 558 UVM_SEQ_OR_DBG_PRINT(s, "migrated_pages_in %llu (%llu MB)\n", 559 num_pages_in, 560 (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 
561 UVM_SEQ_OR_DBG_PRINT(s, "migrated_pages_out %llu (%llu MB)\n", 562 num_pages_out, 563 (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 564 UVM_SEQ_OR_DBG_PRINT(s, "mapped_cpu_pages_dma %llu (%llu MB)\n", 565 mapped_cpu_pages_size / PAGE_SIZE, 566 mapped_cpu_pages_size / (1024u * 1024u)); 567 568 gpu_info_print_ce_caps(gpu, s); 569 570 if (uvm_conf_computing_mode_enabled(gpu)) { 571 UVM_SEQ_OR_DBG_PRINT(s, "dma_buffer_pool_num_buffers %lu\n", 572 gpu->conf_computing.dma_buffer_pool.num_dma_buffers); 573 } 574 } 575 576 static void 577 gpu_fault_stats_print_common(uvm_parent_gpu_t *parent_gpu, struct seq_file *s) 578 { 579 NvU64 num_pages_in; 580 NvU64 num_pages_out; 581 582 UVM_ASSERT(uvm_procfs_is_debug_enabled()); 583 584 UVM_SEQ_OR_DBG_PRINT(s, "replayable_faults %llu\n", parent_gpu->stats.num_replayable_faults); 585 UVM_SEQ_OR_DBG_PRINT(s, "duplicates %llu\n", 586 parent_gpu->fault_buffer_info.replayable.stats.num_duplicate_faults); 587 UVM_SEQ_OR_DBG_PRINT(s, "faults_by_access_type:\n"); 588 UVM_SEQ_OR_DBG_PRINT(s, " prefetch %llu\n", 589 parent_gpu->fault_buffer_info.replayable.stats.num_prefetch_faults); 590 UVM_SEQ_OR_DBG_PRINT(s, " read %llu\n", 591 parent_gpu->fault_buffer_info.replayable.stats.num_read_faults); 592 UVM_SEQ_OR_DBG_PRINT(s, " write %llu\n", 593 parent_gpu->fault_buffer_info.replayable.stats.num_write_faults); 594 UVM_SEQ_OR_DBG_PRINT(s, " atomic %llu\n", 595 parent_gpu->fault_buffer_info.replayable.stats.num_atomic_faults); 596 num_pages_out = atomic64_read(&parent_gpu->fault_buffer_info.replayable.stats.num_pages_out); 597 num_pages_in = atomic64_read(&parent_gpu->fault_buffer_info.replayable.stats.num_pages_in); 598 UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n"); 599 UVM_SEQ_OR_DBG_PRINT(s, " num_pages_in %llu (%llu MB)\n", num_pages_in, 600 (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 601 UVM_SEQ_OR_DBG_PRINT(s, " num_pages_out %llu (%llu MB)\n", num_pages_out, 602 (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 603 UVM_SEQ_OR_DBG_PRINT(s, "replays:\n"); 604 UVM_SEQ_OR_DBG_PRINT(s, " start %llu\n", 605 parent_gpu->fault_buffer_info.replayable.stats.num_replays); 606 UVM_SEQ_OR_DBG_PRINT(s, " start_ack_all %llu\n", 607 parent_gpu->fault_buffer_info.replayable.stats.num_replays_ack_all); 608 UVM_SEQ_OR_DBG_PRINT(s, "non_replayable_faults %llu\n", parent_gpu->stats.num_non_replayable_faults); 609 UVM_SEQ_OR_DBG_PRINT(s, "faults_by_access_type:\n"); 610 UVM_SEQ_OR_DBG_PRINT(s, " read %llu\n", 611 parent_gpu->fault_buffer_info.non_replayable.stats.num_read_faults); 612 UVM_SEQ_OR_DBG_PRINT(s, " write %llu\n", 613 parent_gpu->fault_buffer_info.non_replayable.stats.num_write_faults); 614 UVM_SEQ_OR_DBG_PRINT(s, " atomic %llu\n", 615 parent_gpu->fault_buffer_info.non_replayable.stats.num_atomic_faults); 616 UVM_SEQ_OR_DBG_PRINT(s, "faults_by_addressing:\n"); 617 UVM_SEQ_OR_DBG_PRINT(s, " virtual %llu\n", 618 parent_gpu->stats.num_non_replayable_faults - 619 parent_gpu->fault_buffer_info.non_replayable.stats.num_physical_faults); 620 UVM_SEQ_OR_DBG_PRINT(s, " physical %llu\n", 621 parent_gpu->fault_buffer_info.non_replayable.stats.num_physical_faults); 622 num_pages_out = atomic64_read(&parent_gpu->fault_buffer_info.non_replayable.stats.num_pages_out); 623 num_pages_in = atomic64_read(&parent_gpu->fault_buffer_info.non_replayable.stats.num_pages_in); 624 UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n"); 625 UVM_SEQ_OR_DBG_PRINT(s, " num_pages_in %llu (%llu MB)\n", num_pages_in, 626 (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 627 
UVM_SEQ_OR_DBG_PRINT(s, " num_pages_out %llu (%llu MB)\n", num_pages_out, 628 (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 629 } 630 631 static void gpu_access_counters_print_common(uvm_parent_gpu_t *parent_gpu, struct seq_file *s) 632 { 633 NvU64 num_pages_in; 634 NvU64 num_pages_out; 635 636 UVM_ASSERT(uvm_procfs_is_debug_enabled()); 637 638 num_pages_out = atomic64_read(&parent_gpu->access_counter_buffer_info.stats.num_pages_out); 639 num_pages_in = atomic64_read(&parent_gpu->access_counter_buffer_info.stats.num_pages_in); 640 UVM_SEQ_OR_DBG_PRINT(s, "migrations:\n"); 641 UVM_SEQ_OR_DBG_PRINT(s, " num_pages_in %llu (%llu MB)\n", num_pages_in, 642 (num_pages_in * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 643 UVM_SEQ_OR_DBG_PRINT(s, " num_pages_out %llu (%llu MB)\n", num_pages_out, 644 (num_pages_out * (NvU64)PAGE_SIZE) / (1024u * 1024u)); 645 } 646 647 void uvm_gpu_print(uvm_gpu_t *gpu) 648 { 649 gpu_info_print_common(gpu, NULL); 650 } 651 652 static void gpu_peer_caps_print(uvm_gpu_t **gpu_pair, struct seq_file *s) 653 { 654 bool nvswitch_connected; 655 uvm_aperture_t aperture; 656 uvm_gpu_peer_t *peer_caps; 657 uvm_gpu_t *local; 658 uvm_gpu_t *remote; 659 660 UVM_ASSERT(uvm_procfs_is_debug_enabled()); 661 662 local = gpu_pair[0]; 663 remote = gpu_pair[1]; 664 peer_caps = uvm_gpu_peer_caps(local, remote); 665 aperture = uvm_gpu_peer_aperture(local, remote); 666 nvswitch_connected = uvm_gpus_are_nvswitch_connected(local, remote); 667 UVM_SEQ_OR_DBG_PRINT(s, "Link type %s\n", uvm_gpu_link_type_string(peer_caps->link_type)); 668 UVM_SEQ_OR_DBG_PRINT(s, "Bandwidth %uMBps\n", peer_caps->total_link_line_rate_mbyte_per_s); 669 UVM_SEQ_OR_DBG_PRINT(s, "Aperture %s\n", uvm_aperture_string(aperture)); 670 UVM_SEQ_OR_DBG_PRINT(s, "Connected through NVSWITCH %s\n", nvswitch_connected ? 
"True" : "False"); 671 UVM_SEQ_OR_DBG_PRINT(s, "Refcount %llu\n", UVM_READ_ONCE(peer_caps->ref_count)); 672 } 673 674 static int nv_procfs_read_gpu_info(struct seq_file *s, void *v) 675 { 676 uvm_gpu_t *gpu = (uvm_gpu_t *)s->private; 677 678 if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) 679 return -EAGAIN; 680 681 gpu_info_print_common(gpu, s); 682 683 uvm_up_read(&g_uvm_global.pm.lock); 684 685 return 0; 686 } 687 688 static int nv_procfs_read_gpu_info_entry(struct seq_file *s, void *v) 689 { 690 UVM_ENTRY_RET(nv_procfs_read_gpu_info(s, v)); 691 } 692 693 static int nv_procfs_read_gpu_fault_stats(struct seq_file *s, void *v) 694 { 695 uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)s->private; 696 697 if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) 698 return -EAGAIN; 699 700 gpu_fault_stats_print_common(parent_gpu, s); 701 702 uvm_up_read(&g_uvm_global.pm.lock); 703 704 return 0; 705 } 706 707 static int nv_procfs_read_gpu_fault_stats_entry(struct seq_file *s, void *v) 708 { 709 UVM_ENTRY_RET(nv_procfs_read_gpu_fault_stats(s, v)); 710 } 711 712 static int nv_procfs_read_gpu_access_counters(struct seq_file *s, void *v) 713 { 714 uvm_parent_gpu_t *parent_gpu = (uvm_parent_gpu_t *)s->private; 715 716 if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) 717 return -EAGAIN; 718 719 gpu_access_counters_print_common(parent_gpu, s); 720 721 uvm_up_read(&g_uvm_global.pm.lock); 722 723 return 0; 724 } 725 726 static int nv_procfs_read_gpu_access_counters_entry(struct seq_file *s, void *v) 727 { 728 UVM_ENTRY_RET(nv_procfs_read_gpu_access_counters(s, v)); 729 } 730 731 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_info_entry); 732 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_fault_stats_entry); 733 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_access_counters_entry); 734 735 static NV_STATUS init_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu) 736 { 737 struct proc_dir_entry *gpu_base_dir_entry; 738 char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH]; 739 char gpu_dir_name[sizeof(uuid_text_buffer) + 1]; 740 741 if (!uvm_procfs_is_enabled()) 742 return NV_OK; 743 744 gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir(); 745 746 format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), &parent_gpu->uuid); 747 748 // Create UVM-GPU-${UUID} directory 749 snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%s", uuid_text_buffer); 750 751 parent_gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu_base_dir_entry); 752 if (parent_gpu->procfs.dir == NULL) 753 return NV_ERR_OPERATING_SYSTEM; 754 755 return NV_OK; 756 } 757 758 static void deinit_parent_procfs_dir(uvm_parent_gpu_t *parent_gpu) 759 { 760 proc_remove(parent_gpu->procfs.dir); 761 } 762 763 static NV_STATUS init_parent_procfs_files(uvm_parent_gpu_t *parent_gpu) 764 { 765 // Fault and access counter files are debug only 766 if (!uvm_procfs_is_debug_enabled()) 767 return NV_OK; 768 769 parent_gpu->procfs.fault_stats_file = NV_CREATE_PROC_FILE("fault_stats", 770 parent_gpu->procfs.dir, 771 gpu_fault_stats_entry, 772 parent_gpu); 773 if (parent_gpu->procfs.fault_stats_file == NULL) 774 return NV_ERR_OPERATING_SYSTEM; 775 776 parent_gpu->procfs.access_counters_file = NV_CREATE_PROC_FILE("access_counters", 777 parent_gpu->procfs.dir, 778 gpu_access_counters_entry, 779 parent_gpu); 780 if (parent_gpu->procfs.access_counters_file == NULL) 781 return NV_ERR_OPERATING_SYSTEM; 782 783 return NV_OK; 784 } 785 786 static void deinit_parent_procfs_files(uvm_parent_gpu_t *parent_gpu) 787 { 788 proc_remove(parent_gpu->procfs.access_counters_file); 789 
proc_remove(parent_gpu->procfs.fault_stats_file); 790 } 791 792 static NV_STATUS init_procfs_dirs(uvm_gpu_t *gpu) 793 { 794 struct proc_dir_entry *gpu_base_dir_entry; 795 char symlink_name[16]; // Hold a global_gpu_id_t value in decimal. 796 char uuid_text_buffer[UVM_GPU_UUID_TEXT_BUFFER_LENGTH]; 797 char gpu_dir_name[sizeof(symlink_name) + sizeof(uuid_text_buffer) + 1]; 798 799 if (!uvm_procfs_is_enabled()) 800 return NV_OK; 801 802 format_uuid_to_buffer(uuid_text_buffer, sizeof(uuid_text_buffer), uvm_gpu_uuid(gpu)); 803 804 gpu_base_dir_entry = uvm_procfs_get_gpu_base_dir(); 805 806 // Create UVM-GPU-${UUID}/${sub_processor_index} directory 807 snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_global_id_sub_processor_index(gpu->global_id)); 808 809 gpu->procfs.dir = NV_CREATE_PROC_DIR(gpu_dir_name, gpu->parent->procfs.dir); 810 if (gpu->procfs.dir == NULL) 811 return NV_ERR_OPERATING_SYSTEM; 812 813 // Create symlink from ${global_gpu_id} to 814 // gpus/UVM-GPU-${UUID}/${sub_processor_index} 815 snprintf(symlink_name, sizeof(symlink_name), "%u", uvm_global_id_value(gpu->global_id)); 816 snprintf(gpu_dir_name, 817 sizeof(gpu_dir_name), 818 "%s/%u", 819 uuid_text_buffer, 820 uvm_global_id_sub_processor_index(gpu->global_id)); 821 822 gpu->procfs.dir_symlink = proc_symlink(symlink_name, gpu_base_dir_entry, gpu_dir_name); 823 if (gpu->procfs.dir_symlink == NULL) 824 return NV_ERR_OPERATING_SYSTEM; 825 826 // GPU peer files are debug only 827 if (!uvm_procfs_is_debug_enabled()) 828 return NV_OK; 829 830 gpu->procfs.dir_peers = NV_CREATE_PROC_DIR(UVM_PROC_GPUS_PEER_DIR_NAME, gpu->procfs.dir); 831 if (gpu->procfs.dir_peers == NULL) 832 return NV_ERR_OPERATING_SYSTEM; 833 834 return NV_OK; 835 } 836 837 // The kernel waits on readers to finish before returning from those calls 838 static void deinit_procfs_dirs(uvm_gpu_t *gpu) 839 { 840 proc_remove(gpu->procfs.dir_peers); 841 proc_remove(gpu->procfs.dir_symlink); 842 proc_remove(gpu->procfs.dir); 843 } 844 845 static NV_STATUS init_procfs_files(uvm_gpu_t *gpu) 846 { 847 gpu->procfs.info_file = NV_CREATE_PROC_FILE("info", gpu->procfs.dir, gpu_info_entry, gpu); 848 if (gpu->procfs.info_file == NULL) 849 return NV_ERR_OPERATING_SYSTEM; 850 851 return NV_OK; 852 } 853 854 static void deinit_procfs_files(uvm_gpu_t *gpu) 855 { 856 proc_remove(gpu->procfs.info_file); 857 } 858 859 static void deinit_procfs_peer_cap_files(uvm_gpu_peer_t *peer_caps) 860 { 861 proc_remove(peer_caps->procfs.peer_symlink_file[0]); 862 proc_remove(peer_caps->procfs.peer_symlink_file[1]); 863 proc_remove(peer_caps->procfs.peer_file[0]); 864 proc_remove(peer_caps->procfs.peer_file[1]); 865 } 866 867 static NV_STATUS init_semaphore_pools(uvm_gpu_t *gpu) 868 { 869 NV_STATUS status; 870 uvm_gpu_t *other_gpu; 871 872 status = uvm_gpu_semaphore_pool_create(gpu, &gpu->semaphore_pool); 873 if (status != NV_OK) 874 return status; 875 876 // When the Confidential Computing feature is enabled, a separate secure 877 // pool is created that holds page allocated in the CPR of vidmem. 
878 if (uvm_conf_computing_mode_enabled(gpu)) { 879 status = uvm_gpu_semaphore_secure_pool_create(gpu, &gpu->secure_semaphore_pool); 880 if (status != NV_OK) 881 return status; 882 } 883 884 for_each_global_gpu(other_gpu) { 885 if (uvm_conf_computing_mode_enabled(gpu)) 886 break; 887 if (other_gpu == gpu) 888 continue; 889 status = uvm_gpu_semaphore_pool_map_gpu(other_gpu->semaphore_pool, gpu); 890 if (status != NV_OK) 891 return status; 892 } 893 894 return NV_OK; 895 } 896 897 static void deinit_semaphore_pools(uvm_gpu_t *gpu) 898 { 899 uvm_gpu_t *other_gpu; 900 901 for_each_global_gpu(other_gpu) { 902 if (other_gpu == gpu) 903 continue; 904 uvm_gpu_semaphore_pool_unmap_gpu(other_gpu->semaphore_pool, gpu); 905 } 906 907 uvm_gpu_semaphore_pool_destroy(gpu->semaphore_pool); 908 uvm_gpu_semaphore_pool_destroy(gpu->secure_semaphore_pool); 909 } 910 911 static NV_STATUS find_unused_global_gpu_id(uvm_parent_gpu_t *parent_gpu, uvm_global_gpu_id_t *out_id) 912 { 913 NvU32 i; 914 915 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 916 917 if (!parent_gpu) { 918 for (i = 0; i < UVM_MAX_GPUS; i++) { 919 if (!g_uvm_global.parent_gpus[i]) { 920 *out_id = uvm_global_gpu_id_from_parent_index(i); 921 return NV_OK; 922 } 923 } 924 } 925 else { 926 NvU32 sub_processor_index = find_first_zero_bit(parent_gpu->valid_gpus, UVM_ID_MAX_SUB_PROCESSORS); 927 if (sub_processor_index < UVM_ID_MAX_SUB_PROCESSORS) { 928 *out_id = uvm_global_gpu_id_from_sub_processor_index(parent_gpu->id, sub_processor_index); 929 return NV_OK; 930 } 931 } 932 933 return NV_ERR_INSUFFICIENT_RESOURCES; 934 } 935 936 // Allocates a uvm_parent_gpu_t, assigns the GPU ID, and sets up basic data 937 // structures, but leaves all other initialization up to the caller. 938 static NV_STATUS alloc_parent_gpu(const NvProcessorUuid *gpu_uuid, 939 uvm_gpu_id_t gpu_id, 940 uvm_parent_gpu_t **parent_gpu_out) 941 { 942 uvm_parent_gpu_t *parent_gpu; 943 NV_STATUS status; 944 945 parent_gpu = uvm_kvmalloc_zero(sizeof(*parent_gpu)); 946 if (!parent_gpu) 947 return NV_ERR_NO_MEMORY; 948 949 parent_gpu->id = gpu_id; 950 951 uvm_processor_uuid_copy(&parent_gpu->uuid, gpu_uuid); 952 uvm_sema_init(&parent_gpu->isr.replayable_faults.service_lock, 1, UVM_LOCK_ORDER_ISR); 953 uvm_sema_init(&parent_gpu->isr.non_replayable_faults.service_lock, 1, UVM_LOCK_ORDER_ISR); 954 uvm_sema_init(&parent_gpu->isr.access_counters.service_lock, 1, UVM_LOCK_ORDER_ISR); 955 uvm_spin_lock_irqsave_init(&parent_gpu->isr.interrupts_lock, UVM_LOCK_ORDER_LEAF); 956 uvm_spin_lock_init(&parent_gpu->instance_ptr_table_lock, UVM_LOCK_ORDER_LEAF); 957 uvm_rb_tree_init(&parent_gpu->instance_ptr_table); 958 uvm_rb_tree_init(&parent_gpu->tsg_table); 959 960 // TODO: Bug 3881835: revisit whether to use nv_kthread_q_t or workqueue. 961 status = errno_to_nv_status(nv_kthread_q_init(&parent_gpu->lazy_free_q, "vidmem lazy free")); 962 963 nv_kref_init(&parent_gpu->gpu_kref); 964 965 *parent_gpu_out = parent_gpu; 966 967 return status; 968 } 969 970 // Allocates a uvm_gpu_t struct and initializes the basic fields and leaves all 971 // other initialization up to the caller. 
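// Returns NULL on allocation failure. On success, the new uvm_gpu_t is already
// linked into parent_gpu->gpus[] at its sub-processor index, so remove_gpu()
// can find it during error cleanup.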
972 static uvm_gpu_t *alloc_gpu(uvm_parent_gpu_t *parent_gpu, uvm_global_gpu_id_t global_gpu_id) 973 { 974 NvU32 sub_processor_index; 975 uvm_gpu_t *gpu; 976 977 gpu = uvm_kvmalloc_zero(sizeof(*gpu)); 978 if (!gpu) 979 return gpu; 980 981 gpu->id = parent_gpu->id; 982 gpu->global_id = global_gpu_id; 983 gpu->parent = parent_gpu; 984 985 // Initialize enough of the gpu struct for remove_gpu to be called 986 gpu->magic = UVM_GPU_MAGIC_VALUE; 987 uvm_spin_lock_init(&gpu->peer_info.peer_gpus_lock, UVM_LOCK_ORDER_LEAF); 988 989 sub_processor_index = uvm_global_id_sub_processor_index(global_gpu_id); 990 parent_gpu->gpus[sub_processor_index] = gpu; 991 992 return gpu; 993 } 994 995 static NV_STATUS configure_address_space(uvm_gpu_t *gpu) 996 { 997 NV_STATUS status; 998 NvU32 num_entries; 999 NvU64 va_size; 1000 NvU64 va_per_entry; 1001 1002 status = uvm_page_tree_init(gpu, 1003 NULL, 1004 UVM_PAGE_TREE_TYPE_KERNEL, 1005 gpu->big_page.internal_size, 1006 uvm_gpu_page_tree_init_location(gpu), 1007 &gpu->address_space_tree); 1008 if (status != NV_OK) { 1009 UVM_ERR_PRINT("Initializing the page tree failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1010 return status; 1011 } 1012 1013 num_entries = uvm_mmu_page_tree_entries(&gpu->address_space_tree, 0, UVM_PAGE_SIZE_AGNOSTIC); 1014 1015 UVM_ASSERT(gpu->address_space_tree.hal->num_va_bits() < 64); 1016 va_size = 1ull << gpu->address_space_tree.hal->num_va_bits(); 1017 va_per_entry = va_size / num_entries; 1018 1019 // Make sure that RM's part of the VA is aligned to the VA covered by a 1020 // single top level PDE. 1021 UVM_ASSERT_MSG(gpu->parent->rm_va_base % va_per_entry == 0, 1022 "va_base 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_base, va_per_entry); 1023 UVM_ASSERT_MSG(gpu->parent->rm_va_size % va_per_entry == 0, 1024 "va_size 0x%llx va_per_entry 0x%llx\n", gpu->parent->rm_va_size, va_per_entry); 1025 1026 status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu->rm_address_space, 1027 uvm_page_tree_pdb(&gpu->address_space_tree)->addr.address, num_entries, 1028 uvm_page_tree_pdb(&gpu->address_space_tree)->addr.aperture == UVM_APERTURE_VID, 1029 -1U /* Invalid PASID for internal RM address space */)); 1030 if (status != NV_OK) { 1031 UVM_ERR_PRINT("nvUvmInterfaceSetPageDirectory() failed: %s, GPU %s\n", 1032 nvstatusToString(status), 1033 uvm_gpu_name(gpu)); 1034 return status; 1035 } 1036 gpu->rm_address_space_moved_to_page_tree = true; 1037 1038 return NV_OK; 1039 } 1040 1041 static void deconfigure_address_space(uvm_gpu_t *gpu) 1042 { 1043 if (gpu->rm_address_space_moved_to_page_tree) 1044 uvm_rm_locked_call_void(nvUvmInterfaceUnsetPageDirectory(gpu->rm_address_space)); 1045 1046 if (gpu->address_space_tree.root) 1047 uvm_page_tree_deinit(&gpu->address_space_tree); 1048 } 1049 1050 static NV_STATUS service_interrupts(uvm_parent_gpu_t *parent_gpu) 1051 { 1052 // Asking RM to service interrupts from top half interrupt handler would 1053 // very likely deadlock. 1054 UVM_ASSERT(!in_interrupt()); 1055 1056 return uvm_rm_locked_call(nvUvmInterfaceServiceDeviceInterruptsRM(parent_gpu->rm_device)); 1057 } 1058 1059 NV_STATUS uvm_gpu_check_ecc_error(uvm_gpu_t *gpu) 1060 { 1061 NV_STATUS status = uvm_gpu_check_ecc_error_no_rm(gpu); 1062 1063 if (status == NV_OK || status != NV_WARN_MORE_PROCESSING_REQUIRED) 1064 return status; 1065 1066 // An interrupt that might mean an ECC error needs to be serviced. 
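    // The no-RM check above cannot tell a pending interrupt apart from an
    // actual ECC error, so ask RM to service interrupts and then re-read the
    // error notifier below.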
1067 UVM_ASSERT(status == NV_WARN_MORE_PROCESSING_REQUIRED); 1068 1069 status = service_interrupts(gpu->parent); 1070 if (status != NV_OK) { 1071 UVM_ERR_PRINT("Servicing interrupts failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1072 return status; 1073 } 1074 1075 // After servicing interrupts the ECC error notifier should be current. 1076 if (*gpu->ecc.error_notifier) { 1077 UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu)); 1078 uvm_global_set_fatal_error(NV_ERR_ECC_ERROR); 1079 return NV_ERR_ECC_ERROR; 1080 } 1081 1082 return NV_OK; 1083 } 1084 1085 static NV_STATUS init_parent_gpu(uvm_parent_gpu_t *parent_gpu, 1086 const NvProcessorUuid *gpu_uuid, 1087 const UvmGpuInfo *gpu_info, 1088 const UvmGpuPlatformInfo *gpu_platform_info) 1089 { 1090 NV_STATUS status; 1091 1092 status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(g_uvm_global.rm_session_handle, 1093 gpu_info, 1094 gpu_uuid, 1095 &parent_gpu->rm_device, 1096 NV_FALSE)); 1097 if (status != NV_OK) { 1098 UVM_ERR_PRINT("Creating RM device failed: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1099 return status; 1100 } 1101 1102 status = uvm_conf_computing_init_parent_gpu(parent_gpu); 1103 if (status != NV_OK) { 1104 UVM_ERR_PRINT("Confidential computing: %s, GPU %s\n", 1105 nvstatusToString(status), parent_gpu->name); 1106 return status; 1107 } 1108 1109 parent_gpu->pci_dev = gpu_platform_info->pci_dev; 1110 parent_gpu->closest_cpu_numa_node = dev_to_node(&parent_gpu->pci_dev->dev); 1111 parent_gpu->dma_addressable_start = gpu_platform_info->dma_addressable_start; 1112 parent_gpu->dma_addressable_limit = gpu_platform_info->dma_addressable_limit; 1113 1114 parent_gpu->sli_enabled = (gpu_info->subdeviceCount > 1); 1115 1116 parent_gpu->virt_mode = gpu_info->virtMode; 1117 if (parent_gpu->virt_mode == UVM_VIRT_MODE_LEGACY) { 1118 UVM_ERR_PRINT("Failed to init GPU %s. 
UVM is not supported in legacy virtualization mode\n", parent_gpu->name); 1119 return NV_ERR_NOT_SUPPORTED; 1120 } 1121 1122 if (gpu_info->isSimulated) 1123 ++g_uvm_global.num_simulated_devices; 1124 1125 status = init_parent_procfs_dir(parent_gpu); 1126 if (status != NV_OK) { 1127 UVM_ERR_PRINT("Failed to init parent procfs dir: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1128 return status; 1129 } 1130 1131 status = uvm_hal_init_gpu(parent_gpu); 1132 if (status != NV_OK) { 1133 UVM_ERR_PRINT("Failed to init GPU hal: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1134 return status; 1135 } 1136 1137 uvm_hal_init_properties(parent_gpu); 1138 1139 UVM_ASSERT(!parent_gpu->rm_info.smcEnabled || parent_gpu->smc.supported); 1140 parent_gpu->smc.enabled = !!parent_gpu->rm_info.smcEnabled; 1141 1142 uvm_mmu_init_gpu_chunk_sizes(parent_gpu); 1143 1144 status = uvm_ats_add_gpu(parent_gpu); 1145 if (status != NV_OK) { 1146 UVM_ERR_PRINT("uvm_ats_add_gpu failed: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1147 return status; 1148 } 1149 1150 status = init_parent_procfs_files(parent_gpu); 1151 if (status != NV_OK) { 1152 UVM_ERR_PRINT("Failed to init parent procfs files: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1153 return status; 1154 } 1155 1156 status = uvm_gpu_init_isr(parent_gpu); 1157 if (status != NV_OK) { 1158 UVM_ERR_PRINT("Failed to init ISR: %s, GPU %s\n", nvstatusToString(status), parent_gpu->name); 1159 return status; 1160 } 1161 1162 return NV_OK; 1163 } 1164 1165 static NV_STATUS init_gpu(uvm_gpu_t *gpu, const UvmGpuInfo *gpu_info) 1166 { 1167 NV_STATUS status; 1168 1169 // Presently, an RM client can only subscribe to a single partition per 1170 // GPU. Therefore, UVM needs to create several RM clients. For simplicity, 1171 // and since P2P is not supported when SMC partitions are created, we 1172 // create a client (session) per GPU partition. 1173 if (gpu->parent->smc.enabled) { 1174 UvmPlatformInfo platform_info; 1175 status = uvm_rm_locked_call(nvUvmInterfaceSessionCreate(&gpu->smc.rm_session_handle, &platform_info)); 1176 if (status != NV_OK) { 1177 UVM_ERR_PRINT("Creating RM session failed: %s\n", nvstatusToString(status)); 1178 return status; 1179 } 1180 1181 status = uvm_rm_locked_call(nvUvmInterfaceDeviceCreate(uvm_gpu_session_handle(gpu), 1182 gpu_info, 1183 uvm_gpu_uuid(gpu), 1184 &gpu->smc.rm_device, 1185 NV_TRUE)); 1186 if (status != NV_OK) { 1187 UVM_ERR_PRINT("Creating RM device failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1188 return status; 1189 } 1190 } 1191 1192 gpu->smc.swizz_id = gpu_info->smcSwizzId; 1193 1194 // Initialize the per-GPU procfs dirs as early as possible so that other 1195 // parts of the driver can add files in them as part of their per-GPU init. 
1196 status = init_procfs_dirs(gpu); 1197 if (status != NV_OK) { 1198 UVM_ERR_PRINT("Failed to init procfs dirs: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1199 return status; 1200 } 1201 1202 status = get_gpu_caps(gpu); 1203 if (status != NV_OK) { 1204 UVM_ERR_PRINT("Failed to get GPU caps: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1205 return status; 1206 } 1207 1208 uvm_mmu_init_gpu_peer_addresses(gpu); 1209 1210 status = alloc_and_init_address_space(gpu); 1211 if (status != NV_OK) { 1212 UVM_ERR_PRINT("Creating RM address space failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1213 return status; 1214 } 1215 1216 status = get_gpu_fb_info(gpu); 1217 if (status != NV_OK) { 1218 UVM_ERR_PRINT("Failed to get GPU FB info: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1219 return status; 1220 } 1221 1222 status = get_gpu_ecc_info(gpu); 1223 if (status != NV_OK) { 1224 UVM_ERR_PRINT("Failed to get GPU ECC info: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1225 return status; 1226 } 1227 1228 status = uvm_pmm_gpu_init(&gpu->pmm); 1229 if (status != NV_OK) { 1230 UVM_ERR_PRINT("PMM initialization failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1231 return status; 1232 } 1233 1234 status = uvm_pmm_sysmem_mappings_init(gpu, &gpu->pmm_reverse_sysmem_mappings); 1235 if (status != NV_OK) { 1236 UVM_ERR_PRINT("CPU PMM MMIO initialization failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1237 return status; 1238 } 1239 1240 status = init_semaphore_pools(gpu); 1241 if (status != NV_OK) { 1242 UVM_ERR_PRINT("Failed to initialize the semaphore pool: %s, GPU %s\n", 1243 nvstatusToString(status), 1244 uvm_gpu_name(gpu)); 1245 return status; 1246 } 1247 1248 status = uvm_channel_manager_create(gpu, &gpu->channel_manager); 1249 if (status != NV_OK) { 1250 UVM_ERR_PRINT("Failed to initialize the channel manager: %s, GPU %s\n", 1251 nvstatusToString(status), 1252 uvm_gpu_name(gpu)); 1253 return status; 1254 } 1255 1256 status = configure_address_space(gpu); 1257 if (status != NV_OK) { 1258 UVM_ERR_PRINT("Failed to configure the GPU address space: %s, GPU %s\n", 1259 nvstatusToString(status), 1260 uvm_gpu_name(gpu)); 1261 return status; 1262 } 1263 1264 status = uvm_mmu_create_flat_mappings(gpu); 1265 if (status != NV_OK) { 1266 UVM_ERR_PRINT("Creating flat mappings failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1267 return status; 1268 } 1269 1270 status = uvm_conf_computing_gpu_init(gpu); 1271 if (status != NV_OK) { 1272 UVM_ERR_PRINT("Failed to initialize Confidential Compute: %s for GPU %s\n", 1273 nvstatusToString(status), 1274 uvm_gpu_name(gpu)); 1275 return status; 1276 } 1277 1278 status = init_procfs_files(gpu); 1279 if (status != NV_OK) { 1280 UVM_ERR_PRINT("Failed to init procfs files: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1281 return status; 1282 } 1283 1284 status = uvm_perf_heuristics_add_gpu(gpu); 1285 if (status != NV_OK) { 1286 UVM_ERR_PRINT("Failed to init heuristics: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1287 return status; 1288 } 1289 1290 return NV_OK; 1291 } 1292 1293 // Add a new gpu and register it with RM 1294 // TODO: Bug 2844714: Split parent-specific parts of this function out into a 1295 // separate add_parent_gpu() function. 
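// At a high level, add_gpu() allocates the parent (when needed) and the
// uvm_gpu_t, fills in the GPU info, initializes parent- and GPU-level state,
// checks for ECC errors, publishes the GPU in the global table under
// gpu_table_lock, and finally discovers NVLINK peers. Any failure after the
// allocations is unwound through remove_gpu().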
1296 static NV_STATUS add_gpu(const NvProcessorUuid *gpu_uuid, 1297 const uvm_global_gpu_id_t global_gpu_id, 1298 const UvmGpuInfo *gpu_info, 1299 const UvmGpuPlatformInfo *gpu_platform_info, 1300 uvm_parent_gpu_t *parent_gpu, 1301 uvm_gpu_t **gpu_out) 1302 { 1303 NV_STATUS status; 1304 bool alloc_parent = (parent_gpu == NULL); 1305 uvm_gpu_t *gpu = NULL; 1306 1307 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1308 1309 if (alloc_parent) { 1310 status = alloc_parent_gpu(gpu_uuid, uvm_gpu_id_from_global_gpu_id(global_gpu_id), &parent_gpu); 1311 if (status != NV_OK) 1312 return status; 1313 } 1314 1315 gpu = alloc_gpu(parent_gpu, global_gpu_id); 1316 if (!gpu) { 1317 if (alloc_parent) 1318 uvm_parent_gpu_kref_put(parent_gpu); 1319 1320 return NV_ERR_NO_MEMORY; 1321 } 1322 1323 parent_gpu->num_retained_gpus++; 1324 1325 if (alloc_parent) 1326 fill_gpu_info(parent_gpu, gpu_info); 1327 1328 // After this point all error clean up should be handled by remove_gpu() 1329 1330 if (!gpu_supports_uvm(parent_gpu)) { 1331 UVM_DBG_PRINT("Registration of non-UVM-capable GPU attempted: GPU %s\n", uvm_gpu_name(gpu)); 1332 status = NV_ERR_NOT_SUPPORTED; 1333 goto error; 1334 } 1335 1336 if (alloc_parent) { 1337 status = init_parent_gpu(parent_gpu, gpu_uuid, gpu_info, gpu_platform_info); 1338 if (status != NV_OK) 1339 goto error; 1340 } 1341 1342 status = init_gpu(gpu, gpu_info); 1343 if (status != NV_OK) 1344 goto error; 1345 1346 status = uvm_gpu_check_ecc_error(gpu); 1347 if (status != NV_OK) 1348 goto error; 1349 1350 atomic64_set(&gpu->retained_count, 1); 1351 uvm_global_processor_mask_set(&g_uvm_global.retained_gpus, gpu->global_id); 1352 1353 uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock); 1354 1355 if (alloc_parent) 1356 uvm_global_add_parent_gpu(parent_gpu); 1357 1358 // Mark the GPU as valid in the parent GPU's GPU table. 1359 UVM_ASSERT(!test_bit(uvm_global_id_sub_processor_index(gpu->global_id), parent_gpu->valid_gpus)); 1360 __set_bit(uvm_global_id_sub_processor_index(gpu->global_id), parent_gpu->valid_gpus); 1361 1362 // Although locking correctness does not, at this early point (before the 1363 // GPU is visible in the table) strictly require holding the gpu_table_lock 1364 // in order to read gpu->isr.replayable_faults.handling, nor to enable page 1365 // fault interrupts (this could have been done earlier), it is best to do it 1366 // here, in order to avoid an interrupt storm. That way, we take advantage 1367 // of the spinlock_irqsave side effect of turning off local CPU interrupts, 1368 // part of holding the gpu_table_lock. That means that the local CPU won't 1369 // receive any of these interrupts, until the GPU is safely added to the 1370 // table (where the top half ISR can find it). 1371 // 1372 // As usual with spinlock_irqsave behavior, *other* CPUs can still handle 1373 // these interrupts, but the local CPU will not be slowed down (interrupted) 1374 // by such handling, and can quickly release the gpu_table_lock, thus 1375 // unblocking any other CPU's top half (which waits for the gpu_table_lock). 
1376 if (alloc_parent && parent_gpu->isr.replayable_faults.handling) { 1377 parent_gpu->fault_buffer_hal->enable_replayable_faults(parent_gpu); 1378 1379 // Clear the interrupt bit and force the re-evaluation of the interrupt 1380 // condition to ensure that we don't miss any pending interrupt 1381 parent_gpu->fault_buffer_hal->clear_replayable_faults(parent_gpu, 1382 parent_gpu->fault_buffer_info.replayable.cached_get); 1383 } 1384 1385 // Access counters are enabled on demand 1386 1387 uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock); 1388 1389 if (alloc_parent) { 1390 status = discover_nvlink_peers(gpu); 1391 if (status != NV_OK) { 1392 UVM_ERR_PRINT("Failed to discover NVLINK peers: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu)); 1393 1394 // Nobody can have retained the GPU yet, since we still hold the global 1395 // lock. 1396 UVM_ASSERT(uvm_gpu_retained_count(gpu) == 1); 1397 atomic64_set(&gpu->retained_count, 0); 1398 goto error; 1399 } 1400 } 1401 1402 *gpu_out = gpu; 1403 1404 return NV_OK; 1405 1406 error: 1407 remove_gpu(gpu); 1408 1409 return status; 1410 } 1411 1412 static void sync_parent_gpu_trackers(uvm_parent_gpu_t *parent_gpu, 1413 bool sync_replay_tracker, 1414 bool sync_clear_faulted_tracker) 1415 { 1416 NV_STATUS status; 1417 1418 // Sync the replay tracker since it inherits dependencies from the VA block 1419 // trackers. 1420 if (sync_replay_tracker) { 1421 uvm_gpu_replayable_faults_isr_lock(parent_gpu); 1422 status = uvm_tracker_wait(&parent_gpu->fault_buffer_info.replayable.replay_tracker); 1423 uvm_gpu_replayable_faults_isr_unlock(parent_gpu); 1424 1425 if (status != NV_OK) 1426 UVM_ASSERT(status == uvm_global_get_status()); 1427 } 1428 1429 // Sync the clear_faulted tracker since it inherits dependencies from the 1430 // VA block trackers, too. 1431 if (sync_clear_faulted_tracker) { 1432 uvm_gpu_non_replayable_faults_isr_lock(parent_gpu); 1433 status = uvm_tracker_wait(&parent_gpu->fault_buffer_info.non_replayable.clear_faulted_tracker); 1434 uvm_gpu_non_replayable_faults_isr_unlock(parent_gpu); 1435 1436 if (status != NV_OK) 1437 UVM_ASSERT(status == uvm_global_get_status()); 1438 } 1439 } 1440 1441 // Remove all references the given GPU has to other GPUs, since one of those 1442 // other GPUs is getting removed. This involves waiting for any unfinished 1443 // trackers contained by this GPU. 1444 static void remove_gpus_from_gpu(uvm_gpu_t *gpu) 1445 { 1446 sync_parent_gpu_trackers(gpu->parent, 1447 gpu->parent->isr.replayable_faults.handling, 1448 gpu->parent->isr.non_replayable_faults.handling); 1449 1450 // Sync all trackers in PMM 1451 uvm_pmm_gpu_sync(&gpu->pmm); 1452 1453 // Sync all trackers in the GPU's DMA allocation pool 1454 uvm_conf_computing_dma_buffer_pool_sync(&gpu->conf_computing.dma_buffer_pool); 1455 } 1456 1457 // Remove all references to the given GPU from its parent, since it is being 1458 // removed. This involves waiting for any unfinished trackers contained 1459 // by the parent GPU. 1460 static void remove_gpu_from_parent_gpu(uvm_gpu_t *gpu) 1461 { 1462 // We use *.was_handling instead of *.handling here since this function is 1463 // called after uvm_gpu_disable_isr(), and the *.handling flags will 1464 // already have been copied to *.was_handling, and then set to false. 
1465 sync_parent_gpu_trackers(gpu->parent, 1466 gpu->parent->isr.replayable_faults.was_handling, 1467 gpu->parent->isr.non_replayable_faults.was_handling); 1468 } 1469 1470 static void deinit_parent_gpu(uvm_parent_gpu_t *parent_gpu) 1471 { 1472 // All channels should have been removed before the retained count went to 0 1473 UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->instance_ptr_table)); 1474 UVM_ASSERT(uvm_rb_tree_empty(&parent_gpu->tsg_table)); 1475 1476 // Access counters should have been disabled when the GPU is no longer 1477 // registered in any VA space. 1478 UVM_ASSERT(parent_gpu->isr.access_counters.handling_ref_count == 0); 1479 1480 // Return ownership to RM 1481 uvm_gpu_deinit_isr(parent_gpu); 1482 1483 deinit_parent_procfs_files(parent_gpu); 1484 1485 uvm_ats_remove_gpu(parent_gpu); 1486 1487 UVM_ASSERT(atomic64_read(&parent_gpu->mapped_cpu_pages_size) == 0); 1488 1489 // After calling nvUvmInterfaceUnregisterGpu() the reference to pci_dev may 1490 // not be valid any more so clear it ahead of time. 1491 parent_gpu->pci_dev = NULL; 1492 1493 deinit_parent_procfs_dir(parent_gpu); 1494 1495 if (parent_gpu->rm_info.isSimulated) 1496 --g_uvm_global.num_simulated_devices; 1497 1498 if (parent_gpu->rm_device != 0) 1499 uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(parent_gpu->rm_device)); 1500 1501 uvm_parent_gpu_kref_put(parent_gpu); 1502 } 1503 1504 static void deinit_gpu(uvm_gpu_t *gpu) 1505 { 1506 uvm_gpu_t *other_gpu; 1507 1508 // Remove any pointers to this GPU from other GPUs' trackers. 1509 for_each_global_gpu(other_gpu) { 1510 UVM_ASSERT(other_gpu != gpu); 1511 remove_gpus_from_gpu(other_gpu); 1512 } 1513 1514 // Further, remove any pointers to this GPU from its parent's trackers. 1515 remove_gpu_from_parent_gpu(gpu); 1516 1517 uvm_perf_heuristics_remove_gpu(gpu); 1518 1519 deinit_procfs_files(gpu); 1520 1521 // TODO Bug 3429163: [UVM] Move uvm_mmu_destroy_flat_mapping() to the 1522 // correct spot 1523 uvm_mmu_destroy_flat_mappings(gpu); 1524 1525 // Wait for any deferred frees and their associated trackers to be finished 1526 // before tearing down channels. 1527 uvm_pmm_gpu_sync(&gpu->pmm); 1528 1529 uvm_channel_manager_destroy(gpu->channel_manager); 1530 1531 // Deconfigure the address space only after destroying all the channels as 1532 // in case any of them hit fatal errors, RM will assert that they are not 1533 // idle during nvUvmInterfaceUnsetPageDirectory() and that's an unnecessary 1534 // pain during development. 
1535 deconfigure_address_space(gpu); 1536 1537 deinit_semaphore_pools(gpu); 1538 1539 uvm_pmm_sysmem_mappings_deinit(&gpu->pmm_reverse_sysmem_mappings); 1540 1541 uvm_pmm_gpu_deinit(&gpu->pmm); 1542 1543 if (gpu->rm_address_space != 0) 1544 uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu->rm_address_space)); 1545 1546 deinit_procfs_dirs(gpu); 1547 1548 if (gpu->parent->smc.enabled) { 1549 if (gpu->smc.rm_device != 0) 1550 uvm_rm_locked_call_void(nvUvmInterfaceDeviceDestroy(gpu->smc.rm_device)); 1551 1552 if (gpu->smc.rm_session_handle != 0) 1553 uvm_rm_locked_call_void(nvUvmInterfaceSessionDestroy(gpu->smc.rm_session_handle)); 1554 } 1555 1556 gpu->magic = 0; 1557 } 1558 1559 // Remove a gpu and unregister it from RM 1560 // Note that this is also used in most error paths in add_gpu() 1561 static void remove_gpu(uvm_gpu_t *gpu) 1562 { 1563 NvU32 sub_processor_index; 1564 uvm_parent_gpu_t *parent_gpu; 1565 bool free_parent; 1566 1567 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1568 1569 sub_processor_index = uvm_global_id_sub_processor_index(gpu->global_id); 1570 parent_gpu = gpu->parent; 1571 1572 UVM_ASSERT_MSG(uvm_gpu_retained_count(gpu) == 0, 1573 "gpu_id %u retained_count %llu\n", 1574 uvm_id_value(gpu->id), 1575 uvm_gpu_retained_count(gpu)); 1576 1577 UVM_ASSERT(parent_gpu->num_retained_gpus > 0); 1578 parent_gpu->num_retained_gpus--; 1579 1580 free_parent = (parent_gpu->num_retained_gpus == 0); 1581 1582 // NVLINK peers must be removed and the relevant access counter buffers must 1583 // be flushed before removing this GPU from the global table. See the 1584 // comment on discover_nvlink_peers in add_gpu. 1585 if (free_parent) 1586 destroy_nvlink_peers(gpu); 1587 1588 // uvm_mem_free and other uvm_mem APIs invoked by the Confidential Compute 1589 // deinitialization must be called before the GPU is removed from the global 1590 // table. 1591 // 1592 // TODO: Bug 2008200: Add and remove the GPU in a more reasonable spot. 1593 uvm_conf_computing_gpu_deinit(gpu); 1594 1595 // TODO: Bug 2844714: If the parent is not being freed, the following 1596 // gpu_table_lock is only needed to protect concurrent 1597 // find_first_valid_gpu() in BH from the __clear_bit here. After 1598 // find_first_valid_gpu() is removed, gpu_table_lock should only be acquired 1599 // and released in the free_parent case. 1600 // 1601 // In the free_parent case, gpu_table_lock protects the top half from the 1602 // uvm_global_remove_parent_gpu() 1603 uvm_spin_lock_irqsave(&g_uvm_global.gpu_table_lock); 1604 1605 // Mark the GPU as invalid in the parent GPU's GPU table. 1606 __clear_bit(sub_processor_index, parent_gpu->valid_gpus); 1607 1608 // Remove the GPU from the table. 1609 if (free_parent) 1610 uvm_global_remove_parent_gpu(parent_gpu); 1611 1612 uvm_spin_unlock_irqrestore(&g_uvm_global.gpu_table_lock); 1613 1614 uvm_global_processor_mask_clear(&g_uvm_global.retained_gpus, gpu->global_id); 1615 1616 // If the parent is being freed, stop scheduling new bottom halves and 1617 // update relevant software state. Else flush any pending bottom halves 1618 // before continuing. 1619 if (free_parent) 1620 uvm_gpu_disable_isr(parent_gpu); 1621 else 1622 uvm_gpu_flush_bottom_halves(parent_gpu); 1623 1624 deinit_gpu(gpu); 1625 1626 UVM_ASSERT(parent_gpu->gpus[sub_processor_index] == gpu); 1627 parent_gpu->gpus[sub_processor_index] = NULL; 1628 uvm_kvfree(gpu); 1629 1630 if (free_parent) 1631 deinit_parent_gpu(parent_gpu); 1632 } 1633 1634 // Do not not call this directly. 
It is called by nv_kref_put, when the 1635 // GPU's ref count drops to zero. 1636 static void uvm_parent_gpu_destroy(nv_kref_t *nv_kref) 1637 { 1638 uvm_parent_gpu_t *parent_gpu = container_of(nv_kref, uvm_parent_gpu_t, gpu_kref); 1639 NvU32 sub_processor_index; 1640 1641 UVM_ASSERT(parent_gpu->num_retained_gpus == 0); 1642 UVM_ASSERT(bitmap_empty(parent_gpu->valid_gpus, UVM_ID_MAX_SUB_PROCESSORS)); 1643 1644 nv_kthread_q_stop(&parent_gpu->lazy_free_q); 1645 1646 for (sub_processor_index = 0; sub_processor_index < UVM_ID_MAX_SUB_PROCESSORS; sub_processor_index++) 1647 UVM_ASSERT(!parent_gpu->gpus[sub_processor_index]); 1648 1649 uvm_kvfree(parent_gpu); 1650 } 1651 1652 void uvm_parent_gpu_kref_put(uvm_parent_gpu_t *parent_gpu) 1653 { 1654 nv_kref_put(&parent_gpu->gpu_kref, uvm_parent_gpu_destroy); 1655 } 1656 1657 static void update_stats_gpu_fault_instance(uvm_gpu_t *gpu, 1658 const uvm_fault_buffer_entry_t *fault_entry, 1659 bool is_duplicate) 1660 { 1661 if (!fault_entry->is_replayable) { 1662 switch (fault_entry->fault_access_type) 1663 { 1664 case UVM_FAULT_ACCESS_TYPE_READ: 1665 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_read_faults; 1666 break; 1667 case UVM_FAULT_ACCESS_TYPE_WRITE: 1668 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_write_faults; 1669 break; 1670 case UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK: 1671 case UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG: 1672 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_atomic_faults; 1673 break; 1674 default: 1675 UVM_ASSERT_MSG(false, "Invalid access type for non-replayable faults\n"); 1676 break; 1677 } 1678 1679 if (!fault_entry->is_virtual) 1680 ++gpu->parent->fault_buffer_info.non_replayable.stats.num_physical_faults; 1681 1682 ++gpu->parent->stats.num_non_replayable_faults; 1683 1684 return; 1685 } 1686 1687 UVM_ASSERT(fault_entry->is_virtual); 1688 1689 switch (fault_entry->fault_access_type) 1690 { 1691 case UVM_FAULT_ACCESS_TYPE_PREFETCH: 1692 ++gpu->parent->fault_buffer_info.replayable.stats.num_prefetch_faults; 1693 break; 1694 case UVM_FAULT_ACCESS_TYPE_READ: 1695 ++gpu->parent->fault_buffer_info.replayable.stats.num_read_faults; 1696 break; 1697 case UVM_FAULT_ACCESS_TYPE_WRITE: 1698 ++gpu->parent->fault_buffer_info.replayable.stats.num_write_faults; 1699 break; 1700 case UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK: 1701 case UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG: 1702 ++gpu->parent->fault_buffer_info.replayable.stats.num_atomic_faults; 1703 break; 1704 default: 1705 break; 1706 } 1707 if (is_duplicate || fault_entry->filtered) 1708 ++gpu->parent->fault_buffer_info.replayable.stats.num_duplicate_faults; 1709 1710 ++gpu->parent->stats.num_replayable_faults; 1711 } 1712 1713 static void update_stats_fault_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data) 1714 { 1715 uvm_gpu_t *gpu; 1716 const uvm_fault_buffer_entry_t *fault_entry, *fault_instance; 1717 1718 UVM_ASSERT(event_id == UVM_PERF_EVENT_FAULT); 1719 1720 if (UVM_ID_IS_CPU(event_data->fault.proc_id)) 1721 return; 1722 1723 // The reported fault entry must be the "representative" fault entry 1724 UVM_ASSERT(!event_data->fault.gpu.buffer_entry->filtered); 1725 1726 gpu = uvm_va_space_get_gpu(event_data->fault.space, event_data->fault.proc_id); 1727 1728 fault_entry = event_data->fault.gpu.buffer_entry; 1729 1730 // Update the stats using the representative fault entry and the rest of 1731 // instances 1732 update_stats_gpu_fault_instance(gpu, fault_entry, event_data->fault.gpu.is_duplicate); 1733 1734 list_for_each_entry(fault_instance, 
&fault_entry->merged_instances_list, merged_instances_list) 1735 update_stats_gpu_fault_instance(gpu, fault_instance, event_data->fault.gpu.is_duplicate); 1736 } 1737 1738 static void update_stats_migration_cb(uvm_perf_event_t event_id, uvm_perf_event_data_t *event_data) 1739 { 1740 uvm_gpu_t *gpu_dst = NULL; 1741 uvm_gpu_t *gpu_src = NULL; 1742 NvU64 pages; 1743 bool is_replayable_fault; 1744 bool is_non_replayable_fault; 1745 bool is_access_counter; 1746 uvm_va_space_t *va_space = uvm_va_block_get_va_space(event_data->migration.block); 1747 1748 UVM_ASSERT(event_id == UVM_PERF_EVENT_MIGRATION); 1749 1750 if (UVM_ID_IS_GPU(event_data->migration.dst)) 1751 gpu_dst = uvm_va_space_get_gpu(va_space, event_data->migration.dst); 1752 1753 if (UVM_ID_IS_GPU(event_data->migration.src)) 1754 gpu_src = uvm_va_space_get_gpu(va_space, event_data->migration.src); 1755 1756 if (!gpu_dst && !gpu_src) 1757 return; 1758 1759 // Page prefetching is also triggered by faults 1760 is_replayable_fault = 1761 event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 1762 is_non_replayable_fault = 1763 event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 1764 is_access_counter = 1765 event_data->migration.make_resident_context->cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 1766 1767 pages = event_data->migration.bytes / PAGE_SIZE; 1768 UVM_ASSERT(event_data->migration.bytes % PAGE_SIZE == 0); 1769 UVM_ASSERT(pages > 0); 1770 1771 if (gpu_dst) { 1772 atomic64_add(pages, &gpu_dst->parent->stats.num_pages_in); 1773 if (is_replayable_fault) 1774 atomic64_add(pages, &gpu_dst->parent->fault_buffer_info.replayable.stats.num_pages_in); 1775 else if (is_non_replayable_fault) 1776 atomic64_add(pages, &gpu_dst->parent->fault_buffer_info.non_replayable.stats.num_pages_in); 1777 else if (is_access_counter) 1778 atomic64_add(pages, &gpu_dst->parent->access_counter_buffer_info.stats.num_pages_in); 1779 } 1780 if (gpu_src) { 1781 atomic64_add(pages, &gpu_src->parent->stats.num_pages_out); 1782 if (is_replayable_fault) 1783 atomic64_add(pages, &gpu_src->parent->fault_buffer_info.replayable.stats.num_pages_out); 1784 else if (is_non_replayable_fault) 1785 atomic64_add(pages, &gpu_src->parent->fault_buffer_info.non_replayable.stats.num_pages_out); 1786 else if (is_access_counter) 1787 atomic64_add(pages, &gpu_src->parent->access_counter_buffer_info.stats.num_pages_out); 1788 } 1789 } 1790 1791 // Override the UVM driver and GPU settings from the module loader 1792 static void uvm_param_conf(void) 1793 { 1794 // uvm_peer_copy: Valid entries are "phys" and "virt" for Ampere+ GPUs. 
1795 // No effect in pre-Ampere GPUs 1796 if (strcmp(uvm_peer_copy, UVM_PARAM_PEER_COPY_VIRTUAL) == 0) { 1797 g_uvm_global.peer_copy_mode = UVM_GPU_PEER_COPY_MODE_VIRTUAL; 1798 } 1799 else { 1800 if (strcmp(uvm_peer_copy, UVM_PARAM_PEER_COPY_PHYSICAL) != 0) { 1801 pr_info("Invalid value for uvm_peer_copy = %s, using %s instead.\n", 1802 uvm_peer_copy, UVM_PARAM_PEER_COPY_PHYSICAL); 1803 } 1804 1805 g_uvm_global.peer_copy_mode = UVM_GPU_PEER_COPY_MODE_PHYSICAL; 1806 } 1807 } 1808 1809 NV_STATUS uvm_gpu_init(void) 1810 { 1811 NV_STATUS status; 1812 1813 uvm_param_conf(); 1814 1815 status = uvm_hal_init_table(); 1816 if (status != NV_OK) { 1817 UVM_ERR_PRINT("uvm_hal_init_table() failed: %s\n", nvstatusToString(status)); 1818 return status; 1819 } 1820 1821 return NV_OK; 1822 } 1823 1824 void uvm_gpu_exit(void) 1825 { 1826 uvm_parent_gpu_t *parent_gpu; 1827 1828 for_each_parent_gpu(parent_gpu) 1829 UVM_ASSERT_MSG(false, "GPU still present: %s\n", parent_gpu->name); 1830 1831 // CPU should never be in the retained GPUs mask 1832 UVM_ASSERT(!uvm_global_processor_mask_test(&g_uvm_global.retained_gpus, UVM_GLOBAL_ID_CPU)); 1833 } 1834 1835 NV_STATUS uvm_gpu_init_va_space(uvm_va_space_t *va_space) 1836 { 1837 NV_STATUS status; 1838 1839 if (uvm_procfs_is_debug_enabled()) { 1840 status = uvm_perf_register_event_callback(&va_space->perf_events, 1841 UVM_PERF_EVENT_FAULT, 1842 update_stats_fault_cb); 1843 if (status != NV_OK) 1844 return status; 1845 1846 status = uvm_perf_register_event_callback(&va_space->perf_events, 1847 UVM_PERF_EVENT_MIGRATION, 1848 update_stats_migration_cb); 1849 if (status != NV_OK) 1850 return status; 1851 } 1852 1853 return NV_OK; 1854 } 1855 1856 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid) 1857 { 1858 uvm_parent_gpu_t *parent_gpu; 1859 1860 for_each_parent_gpu(parent_gpu) { 1861 if (uvm_processor_uuid_eq(&parent_gpu->uuid, gpu_uuid)) 1862 return parent_gpu; 1863 } 1864 1865 return NULL; 1866 } 1867 1868 uvm_parent_gpu_t *uvm_parent_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid) 1869 { 1870 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1871 1872 return uvm_parent_gpu_get_by_uuid_locked(gpu_uuid); 1873 } 1874 1875 static uvm_gpu_t *uvm_gpu_get_by_uuid_locked(const NvProcessorUuid *gpu_uuid) 1876 { 1877 uvm_gpu_id_t gpu_id; 1878 uvm_global_gpu_id_t global_gpu_id; 1879 uvm_gpu_t *gpu; 1880 1881 for_each_gpu_id(gpu_id) { 1882 global_gpu_id = uvm_global_gpu_id_from_gpu_id(gpu_id); 1883 gpu = uvm_gpu_get(global_gpu_id); 1884 if (gpu) { 1885 if (uvm_processor_uuid_eq(uvm_gpu_uuid(gpu), gpu_uuid)) { 1886 UVM_ASSERT(!gpu->parent->smc.enabled); 1887 return gpu; 1888 } 1889 } 1890 } 1891 1892 return NULL; 1893 } 1894 1895 uvm_gpu_t *uvm_gpu_get_by_uuid(const NvProcessorUuid *gpu_uuid) 1896 { 1897 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1898 1899 return uvm_gpu_get_by_uuid_locked(gpu_uuid); 1900 } 1901 1902 uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id_locked(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id) 1903 { 1904 uvm_gpu_t *gpu; 1905 1906 UVM_ASSERT(parent_gpu); 1907 1908 for_each_gpu_in_parent(parent_gpu, gpu) { 1909 if (gpu->smc.swizz_id == swizz_id) 1910 return gpu; 1911 } 1912 1913 return NULL; 1914 } 1915 1916 uvm_gpu_t *uvm_gpu_get_by_parent_and_swizz_id(uvm_parent_gpu_t *parent_gpu, NvU32 swizz_id) 1917 { 1918 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1919 1920 return uvm_gpu_get_by_parent_and_swizz_id_locked(parent_gpu, swizz_id); 1921 } 1922 1923 // Increment the refcount for the GPU with the given 
UUID. If this is the first 1924 // time that this UUID is retained, the GPU is added to UVM. 1925 // When SMC partitioning is enabled, user_rm_device contains the user handles 1926 // that were created by the caller, and that can be used to identify and 1927 // obtain information about the partition. nvUvmInterfaceGetGpuInfo returns, in 1928 // gpu_info, whether SMC is enabled and the swizzId corresponding to the 1929 // partition. 1930 static NV_STATUS gpu_retain_by_uuid_locked(const NvProcessorUuid *gpu_uuid, 1931 const uvm_rm_user_object_t *user_rm_device, 1932 uvm_gpu_t **gpu_out) 1933 { 1934 NV_STATUS status = NV_OK; 1935 uvm_gpu_t *gpu = NULL; 1936 uvm_parent_gpu_t *parent_gpu; 1937 UvmGpuInfo *gpu_info = NULL; 1938 UvmGpuClientInfo client_info = {0}; 1939 UvmGpuPlatformInfo gpu_platform_info = {0}; 1940 uvm_global_gpu_id_t global_gpu_id; 1941 1942 client_info.hClient = user_rm_device->user_client; 1943 client_info.hSmcPartRef = user_rm_device->user_object; 1944 1945 gpu_info = uvm_kvmalloc_zero(sizeof(*gpu_info)); 1946 if (!gpu_info) 1947 return NV_ERR_NO_MEMORY; 1948 1949 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 1950 1951 parent_gpu = uvm_parent_gpu_get_by_uuid(gpu_uuid); 1952 1953 if (parent_gpu == NULL) { 1954 // If this is the first time the UUID is seen, register it on RM 1955 status = uvm_rm_locked_call(nvUvmInterfaceRegisterGpu(gpu_uuid, &gpu_platform_info)); 1956 if (status != NV_OK) 1957 goto error_free_gpu_info; 1958 } 1959 1960 status = uvm_rm_locked_call(nvUvmInterfaceGetGpuInfo(gpu_uuid, &client_info, gpu_info)); 1961 if (status != NV_OK) 1962 goto error_unregister; 1963 1964 if (parent_gpu != NULL) { 1965 // If the UUID has been seen before, and if SMC is enabled, then check 1966 // if this specific partition has been seen previously. The UUID-based 1967 // look-up above may have succeeded for a different partition with the 1968 // same parent GPU. 
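    // Note (illustrative, summarizing the flow below): after these checks, gpu
    // either points at the uvm_gpu_t already registered for this exact partition,
    // in which case only its retained_count is incremented, or it stays NULL and
    // a new uvm_gpu_t is created via find_unused_global_gpu_id() and add_gpu().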
1969 if (gpu_info->smcEnabled) { 1970 gpu = uvm_gpu_get_by_parent_and_swizz_id(parent_gpu, gpu_info->smcSwizzId); 1971 } 1972 else { 1973 gpu = parent_gpu->gpus[0]; 1974 UVM_ASSERT(gpu != NULL); 1975 } 1976 } 1977 1978 if (gpu == NULL) { 1979 status = find_unused_global_gpu_id(parent_gpu, &global_gpu_id); 1980 if (status != NV_OK) 1981 goto error_unregister; 1982 1983 status = add_gpu(gpu_uuid, global_gpu_id, gpu_info, &gpu_platform_info, parent_gpu, &gpu); 1984 if (status != NV_OK) 1985 goto error_unregister; 1986 } 1987 else { 1988 atomic64_inc(&gpu->retained_count); 1989 } 1990 1991 *gpu_out = gpu; 1992 1993 uvm_kvfree(gpu_info); 1994 1995 return status; 1996 1997 error_unregister: 1998 if (parent_gpu == NULL) 1999 uvm_rm_locked_call_void(nvUvmInterfaceUnregisterGpu(gpu_uuid)); 2000 error_free_gpu_info: 2001 uvm_kvfree(gpu_info); 2002 2003 return status; 2004 } 2005 2006 NV_STATUS uvm_gpu_retain_by_uuid(const NvProcessorUuid *gpu_uuid, 2007 const uvm_rm_user_object_t *user_rm_device, 2008 uvm_gpu_t **gpu_out) 2009 { 2010 NV_STATUS status; 2011 uvm_mutex_lock(&g_uvm_global.global_lock); 2012 status = gpu_retain_by_uuid_locked(gpu_uuid, user_rm_device, gpu_out); 2013 uvm_mutex_unlock(&g_uvm_global.global_lock); 2014 return status; 2015 } 2016 2017 void uvm_gpu_retain(uvm_gpu_t *gpu) 2018 { 2019 UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0); 2020 atomic64_inc(&gpu->retained_count); 2021 } 2022 2023 void uvm_gpu_release_locked(uvm_gpu_t *gpu) 2024 { 2025 uvm_parent_gpu_t *parent_gpu = gpu->parent; 2026 2027 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2028 UVM_ASSERT(uvm_gpu_retained_count(gpu) > 0); 2029 2030 if (atomic64_dec_and_test(&gpu->retained_count)) { 2031 nv_kref_get(&parent_gpu->gpu_kref); 2032 remove_gpu(gpu); 2033 if (parent_gpu->num_retained_gpus == 0) 2034 uvm_rm_locked_call_void(nvUvmInterfaceUnregisterGpu(&parent_gpu->uuid)); 2035 uvm_parent_gpu_kref_put(parent_gpu); 2036 } 2037 } 2038 2039 void uvm_gpu_release(uvm_gpu_t *gpu) 2040 { 2041 uvm_mutex_lock(&g_uvm_global.global_lock); 2042 uvm_gpu_release_locked(gpu); 2043 uvm_mutex_unlock(&g_uvm_global.global_lock); 2044 } 2045 2046 // Note: Peer table is an upper triangular matrix packed into a flat array. 2047 // This function converts an index of 2D array of size [N x N] into an index 2048 // of upper triangular array of size [((N - 1) * ((N - 1) + 1)) / 2] which 2049 // does not include diagonal elements. 2050 NvU32 uvm_gpu_peer_table_index(const uvm_gpu_id_t gpu_id0, const uvm_gpu_id_t gpu_id1) 2051 { 2052 NvU32 square_index, triangular_index; 2053 NvU32 gpu_index0 = uvm_id_gpu_index(gpu_id0); 2054 NvU32 gpu_index1 = uvm_id_gpu_index(gpu_id1); 2055 2056 UVM_ASSERT(!uvm_id_equal(gpu_id0, gpu_id1)); 2057 2058 // Calculate an index of 2D array by re-ordering indices to always point 2059 // to the same entry. 2060 square_index = min(gpu_index0, gpu_index1) * UVM_ID_MAX_GPUS + 2061 max(gpu_index0, gpu_index1); 2062 2063 // Calculate and subtract number of lower triangular matrix elements till 2064 // the current row (which includes diagonal elements) to get the correct 2065 // index in an upper triangular matrix. 2066 // Note: As gpu_id can be [1, N), no extra logic is needed to calculate 2067 // diagonal elements. 
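    // Worked example (illustrative only; assumes UVM_ID_MAX_GPUS == 8, that
    // SUM_FROM_0_TO_N(n) evaluates to n * (n + 1) / 2, and that
    // uvm_id_gpu_index() is uvm_id_value() - 1): for gpu_id0 = 2 and
    // gpu_id1 = 5, the gpu indices are 1 and 4, so
    //     square_index     = 1 * 8 + 4 = 12
    //     triangular_index = 12 - SUM_FROM_0_TO_N(2) = 12 - 3 = 9
    // i.e. pairs (0,1)..(0,7) occupy packed indices 0..6, (1,2) is 7, (1,3) is 8,
    // and (1,4) lands on 9.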
2068 triangular_index = square_index - SUM_FROM_0_TO_N(min(uvm_id_value(gpu_id0), uvm_id_value(gpu_id1))); 2069 2070 UVM_ASSERT(triangular_index < UVM_MAX_UNIQUE_GPU_PAIRS); 2071 2072 return triangular_index; 2073 } 2074 2075 NV_STATUS uvm_gpu_check_ecc_error_no_rm(uvm_gpu_t *gpu) 2076 { 2077 // We may need to call service_interrupts() which cannot be done in the top 2078 // half interrupt handler so assert here as well to catch improper use as 2079 // early as possible. 2080 UVM_ASSERT(!in_interrupt()); 2081 2082 if (!gpu->ecc.enabled) 2083 return NV_OK; 2084 2085 // Early out If a global ECC error is already set to not spam the logs with 2086 // the same error. 2087 if (uvm_global_get_status() == NV_ERR_ECC_ERROR) 2088 return NV_ERR_ECC_ERROR; 2089 2090 if (*gpu->ecc.error_notifier) { 2091 UVM_ERR_PRINT("ECC error encountered, GPU %s\n", uvm_gpu_name(gpu)); 2092 uvm_global_set_fatal_error(NV_ERR_ECC_ERROR); 2093 return NV_ERR_ECC_ERROR; 2094 } 2095 2096 // RM hasn't seen an ECC error yet, check whether there is a pending 2097 // interrupt that might indicate one. We might get false positives because 2098 // the interrupt bits we read are not ECC-specific. They're just the 2099 // top-level bits for any interrupt on all engines which support ECC. On 2100 // Pascal for example, RM returns us a mask with the bits for GR, L2, and 2101 // FB, because any of those might raise an ECC interrupt. So if they're set 2102 // we have to ask RM to check whether it was really an ECC error (and a 2103 // double-bit ECC error at that), in which case it sets the notifier. 2104 if ((*gpu->ecc.hw_interrupt_tree_location & gpu->ecc.mask) == 0) { 2105 // No pending interrupts. 2106 return NV_OK; 2107 } 2108 2109 // An interrupt that might mean an ECC error needs to be serviced, signal 2110 // that to the caller. 
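    // Caller-side sketch (illustrative; it assumes the RM-capable counterpart
    // uvm_gpu_check_ecc_error() exists elsewhere in the driver, as the comment
    // at the top of this function suggests):
    //
    //     status = uvm_gpu_check_ecc_error_no_rm(gpu);
    //     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
    //         // Not in interrupt context: let RM service the interrupt and
    //         // re-check the error notifier.
    //         status = uvm_gpu_check_ecc_error(gpu);
    //     }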
2111 return NV_WARN_MORE_PROCESSING_REQUIRED; 2112 } 2113 2114 static NV_STATUS get_p2p_caps(uvm_gpu_t *gpu0, 2115 uvm_gpu_t *gpu1, 2116 UvmGpuP2PCapsParams *p2p_caps_params) 2117 { 2118 NV_STATUS status; 2119 uvmGpuDeviceHandle rm_device0, rm_device1; 2120 2121 if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) { 2122 rm_device0 = uvm_gpu_device_handle(gpu0); 2123 rm_device1 = uvm_gpu_device_handle(gpu1); 2124 } 2125 else { 2126 rm_device0 = uvm_gpu_device_handle(gpu1); 2127 rm_device1 = uvm_gpu_device_handle(gpu0); 2128 } 2129 2130 memset(p2p_caps_params, 0, sizeof(*p2p_caps_params)); 2131 status = uvm_rm_locked_call(nvUvmInterfaceGetP2PCaps(rm_device0, rm_device1, p2p_caps_params)); 2132 if (status != NV_OK) { 2133 UVM_ERR_PRINT("nvUvmInterfaceGetP2PCaps() failed with error: %s, for GPU0:%s and GPU1:%s\n", 2134 nvstatusToString(status), 2135 uvm_gpu_name(gpu0), 2136 uvm_gpu_name(gpu1)); 2137 return status; 2138 } 2139 2140 if (p2p_caps_params->p2pLink != UVM_LINK_TYPE_NONE) { 2141 // P2P is not supported under SMC partitioning 2142 UVM_ASSERT(!gpu0->parent->smc.enabled); 2143 UVM_ASSERT(!gpu1->parent->smc.enabled); 2144 } 2145 2146 return NV_OK; 2147 } 2148 2149 static NV_STATUS create_p2p_object(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1, NvHandle *p2p_handle) 2150 { 2151 NV_STATUS status; 2152 uvmGpuDeviceHandle rm_device0, rm_device1; 2153 2154 if (uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id)) { 2155 rm_device0 = uvm_gpu_device_handle(gpu0); 2156 rm_device1 = uvm_gpu_device_handle(gpu1); 2157 } 2158 else { 2159 rm_device0 = uvm_gpu_device_handle(gpu1); 2160 rm_device1 = uvm_gpu_device_handle(gpu0); 2161 } 2162 2163 *p2p_handle = 0; 2164 2165 status = uvm_rm_locked_call(nvUvmInterfaceP2pObjectCreate(rm_device0, rm_device1, p2p_handle)); 2166 if (status != NV_OK) { 2167 UVM_ERR_PRINT("nvUvmInterfaceP2pObjectCreate() failed with error: %s, for GPU0:%s and GPU1:%s\n", 2168 nvstatusToString(status), 2169 uvm_gpu_name(gpu0), 2170 uvm_gpu_name(gpu1)); 2171 return status; 2172 } 2173 2174 UVM_ASSERT(*p2p_handle); 2175 return NV_OK; 2176 } 2177 2178 static void set_optimal_p2p_write_ces(const UvmGpuP2PCapsParams *p2p_caps_params, 2179 const uvm_gpu_peer_t *peer_caps, 2180 uvm_gpu_t *gpu0, 2181 uvm_gpu_t *gpu1) 2182 { 2183 bool sorted; 2184 NvU32 ce0, ce1; 2185 2186 if (peer_caps->link_type < UVM_GPU_LINK_NVLINK_1) 2187 return; 2188 2189 sorted = uvm_id_value(gpu0->id) < uvm_id_value(gpu1->id); 2190 ce0 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 0 : 1]; 2191 ce1 = p2p_caps_params->optimalNvlinkWriteCEs[sorted ? 
1 : 0]; 2192 2193 // Indirect peers communicate through the CPU, so the optimal CE 2194 // should match the one selected for writing to system memory 2195 if (peer_caps->is_indirect_peer) { 2196 uvm_channel_pool_t *pool; 2197 2198 pool = gpu0->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU]; 2199 UVM_ASSERT(ce0 == pool->engine_index); 2200 2201 pool = gpu1->channel_manager->pool_to_use.default_for_type[UVM_CHANNEL_TYPE_GPU_TO_CPU]; 2202 UVM_ASSERT(ce1 == pool->engine_index); 2203 } 2204 2205 uvm_channel_manager_set_p2p_ce(gpu0->channel_manager, gpu1, ce0); 2206 uvm_channel_manager_set_p2p_ce(gpu1->channel_manager, gpu0, ce1); 2207 } 2208 2209 static int nv_procfs_read_gpu_peer_caps(struct seq_file *s, void *v) 2210 { 2211 if (!uvm_down_read_trylock(&g_uvm_global.pm.lock)) 2212 return -EAGAIN; 2213 2214 gpu_peer_caps_print((uvm_gpu_t **)s->private, s); 2215 2216 uvm_up_read(&g_uvm_global.pm.lock); 2217 2218 return 0; 2219 } 2220 2221 static int nv_procfs_read_gpu_peer_caps_entry(struct seq_file *s, void *v) 2222 { 2223 UVM_ENTRY_RET(nv_procfs_read_gpu_peer_caps(s, v)); 2224 } 2225 2226 UVM_DEFINE_SINGLE_PROCFS_FILE(gpu_peer_caps_entry); 2227 2228 static NV_STATUS init_procfs_peer_cap_files(uvm_gpu_t *local, uvm_gpu_t *remote, size_t local_idx) 2229 { 2230 // This needs to hold a gpu_id_t in decimal 2231 char gpu_dir_name[16]; 2232 2233 // This needs to hold a GPU UUID 2234 char symlink_name[UVM_GPU_UUID_TEXT_BUFFER_LENGTH]; 2235 uvm_gpu_peer_t *peer_caps; 2236 2237 if (!uvm_procfs_is_enabled()) 2238 return NV_OK; 2239 2240 peer_caps = uvm_gpu_peer_caps(local, remote); 2241 peer_caps->procfs.pairs[local_idx][0] = local; 2242 peer_caps->procfs.pairs[local_idx][1] = remote; 2243 2244 // Create gpus/gpuA/peers/gpuB 2245 snprintf(gpu_dir_name, sizeof(gpu_dir_name), "%u", uvm_id_value(remote->id)); 2246 peer_caps->procfs.peer_file[local_idx] = NV_CREATE_PROC_FILE(gpu_dir_name, 2247 local->procfs.dir_peers, 2248 gpu_peer_caps_entry, 2249 &peer_caps->procfs.pairs[local_idx]); 2250 2251 if (peer_caps->procfs.peer_file[local_idx] == NULL) 2252 return NV_ERR_OPERATING_SYSTEM; 2253 2254 // Create a symlink from UVM GPU UUID (UVM-GPU-...) to the UVM GPU ID gpuB 2255 format_uuid_to_buffer(symlink_name, sizeof(symlink_name), uvm_gpu_uuid(remote)); 2256 peer_caps->procfs.peer_symlink_file[local_idx] = proc_symlink(symlink_name, 2257 local->procfs.dir_peers, 2258 gpu_dir_name); 2259 if (peer_caps->procfs.peer_symlink_file[local_idx] == NULL) 2260 return NV_ERR_OPERATING_SYSTEM; 2261 2262 return NV_OK; 2263 } 2264 2265 static NV_STATUS init_peer_access(uvm_gpu_t *gpu0, 2266 uvm_gpu_t *gpu1, 2267 const UvmGpuP2PCapsParams *p2p_caps_params, 2268 uvm_gpu_peer_t *peer_caps) 2269 { 2270 NV_STATUS status; 2271 2272 UVM_ASSERT(p2p_caps_params->p2pLink != UVM_LINK_TYPE_C2C); 2273 2274 // check for peer-to-peer compatibility (PCI-E or NvLink). 
2275 peer_caps->link_type = get_gpu_link_type(p2p_caps_params->p2pLink); 2276 if (peer_caps->link_type == UVM_GPU_LINK_INVALID 2277 || peer_caps->link_type == UVM_GPU_LINK_C2C 2278 ) 2279 return NV_ERR_NOT_SUPPORTED; 2280 2281 peer_caps->total_link_line_rate_mbyte_per_s = p2p_caps_params->totalLinkLineRateMBps; 2282 2283 // Initialize peer ids and establish peer mappings 2284 peer_caps->is_indirect_peer = (p2p_caps_params->indirectAccess == NV_TRUE); 2285 2286 if (peer_caps->is_indirect_peer) { 2287 UVM_ASSERT(gpu0->mem_info.numa.enabled); 2288 UVM_ASSERT(gpu1->mem_info.numa.enabled); 2289 2290 status = uvm_pmm_gpu_indirect_peer_init(&gpu0->pmm, gpu1); 2291 if (status != NV_OK) 2292 return status; 2293 2294 status = uvm_pmm_gpu_indirect_peer_init(&gpu1->pmm, gpu0); 2295 if (status != NV_OK) 2296 return status; 2297 2298 set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1); 2299 UVM_ASSERT(peer_caps->total_link_line_rate_mbyte_per_s == 0); 2300 } 2301 else { 2302 // Peer id from min(gpu_id0, gpu_id1) -> max(gpu_id0, gpu_id1) 2303 peer_caps->peer_ids[0] = p2p_caps_params->peerIds[0]; 2304 2305 // Peer id from max(gpu_id0, gpu_id1) -> min(gpu_id0, gpu_id1) 2306 peer_caps->peer_ids[1] = p2p_caps_params->peerIds[1]; 2307 2308 // Establish peer mappings from each GPU to the other. Indirect peers 2309 // do not require identity mappings since they use sysmem aperture to 2310 // communicate. 2311 status = uvm_mmu_create_peer_identity_mappings(gpu0, gpu1); 2312 if (status != NV_OK) 2313 return status; 2314 2315 status = uvm_mmu_create_peer_identity_mappings(gpu1, gpu0); 2316 if (status != NV_OK) 2317 return status; 2318 2319 set_optimal_p2p_write_ces(p2p_caps_params, peer_caps, gpu0, gpu1); 2320 2321 UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0); 2322 UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1); 2323 2324 // In the case of NVLINK peers, this initialization will happen during 2325 // add_gpu. As soon as the peer info table is assigned below, the access 2326 // counter bottom half could start operating on the GPU being newly 2327 // added and inspecting the peer caps, so all of the appropriate 2328 // initialization must happen before this point. 
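        // Reader-side sketch (illustrative, not part of this function): consumers
        // of this table, e.g. uvm_gpu_get_processor_id_by_address() later in this
        // file, take the same spinlock around their lookups:
        //
        //     uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock);
        //     peer = gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)];
        //     uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock);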
2329 uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock); 2330 2331 uvm_processor_mask_set(&gpu0->peer_info.peer_gpu_mask, gpu1->id); 2332 UVM_ASSERT(gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] == NULL); 2333 gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = gpu1; 2334 2335 uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock); 2336 uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock); 2337 2338 uvm_processor_mask_set(&gpu1->peer_info.peer_gpu_mask, gpu0->id); 2339 UVM_ASSERT(gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] == NULL); 2340 gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = gpu0; 2341 2342 uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock); 2343 } 2344 2345 if (!uvm_procfs_is_debug_enabled()) 2346 return NV_OK; 2347 2348 status = init_procfs_peer_cap_files(gpu0, gpu1, 0); 2349 if (status != NV_OK) 2350 return status; 2351 2352 status = init_procfs_peer_cap_files(gpu1, gpu0, 1); 2353 if (status != NV_OK) 2354 return status; 2355 2356 return NV_OK; 2357 } 2358 2359 static NV_STATUS enable_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 2360 { 2361 NV_STATUS status = NV_OK; 2362 UvmGpuP2PCapsParams p2p_caps_params; 2363 uvm_gpu_peer_t *peer_caps; 2364 NvHandle p2p_handle; 2365 2366 UVM_ASSERT(gpu0); 2367 UVM_ASSERT(gpu1); 2368 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2369 2370 peer_caps = uvm_gpu_peer_caps(gpu0, gpu1); 2371 UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_INVALID); 2372 UVM_ASSERT(peer_caps->ref_count == 0); 2373 2374 status = create_p2p_object(gpu0, gpu1, &p2p_handle); 2375 if (status != NV_OK) 2376 return status; 2377 2378 // Store the handle in the global table. 2379 peer_caps->p2p_handle = p2p_handle; 2380 2381 status = get_p2p_caps(gpu0, gpu1, &p2p_caps_params); 2382 if (status != NV_OK) 2383 goto cleanup; 2384 2385 // Sanity checks 2386 UVM_ASSERT(p2p_caps_params.indirectAccess == NV_FALSE); 2387 UVM_ASSERT(p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE); 2388 2389 status = init_peer_access(gpu0, gpu1, &p2p_caps_params, peer_caps); 2390 if (status != NV_OK) 2391 goto cleanup; 2392 2393 return NV_OK; 2394 2395 cleanup: 2396 disable_peer_access(gpu0, gpu1); 2397 return status; 2398 } 2399 2400 static NV_STATUS enable_nvlink_peer_access(uvm_gpu_t *gpu0, 2401 uvm_gpu_t *gpu1, 2402 UvmGpuP2PCapsParams *p2p_caps_params) 2403 { 2404 NV_STATUS status = NV_OK; 2405 NvHandle p2p_handle; 2406 uvm_gpu_peer_t *peer_caps; 2407 2408 UVM_ASSERT(gpu0); 2409 UVM_ASSERT(gpu1); 2410 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2411 2412 peer_caps = uvm_gpu_peer_caps(gpu0, gpu1); 2413 UVM_ASSERT(peer_caps->ref_count == 0); 2414 peer_caps->ref_count = 1; 2415 2416 if (!p2p_caps_params->indirectAccess) { 2417 // Create P2P object for direct NVLink peers 2418 status = create_p2p_object(gpu0, gpu1, &p2p_handle); 2419 if (status != NV_OK) { 2420 UVM_ERR_PRINT("failed to create a P2P object with error: %s, for GPU1:%s and GPU2:%s \n", 2421 nvstatusToString(status), 2422 uvm_gpu_name(gpu0), 2423 uvm_gpu_name(gpu1)); 2424 return status; 2425 } 2426 2427 UVM_ASSERT(p2p_handle != 0); 2428 2429 // Store the handle in the global table. 
2430 peer_caps->p2p_handle = p2p_handle; 2431 2432 // Update p2p caps after p2p object creation as it generates the peer 2433 // ids 2434 status = get_p2p_caps(gpu0, gpu1, p2p_caps_params); 2435 if (status != NV_OK) 2436 goto cleanup; 2437 } 2438 2439 status = init_peer_access(gpu0, gpu1, p2p_caps_params, peer_caps); 2440 if (status != NV_OK) 2441 goto cleanup; 2442 2443 return NV_OK; 2444 2445 cleanup: 2446 disable_peer_access(gpu0, gpu1); 2447 return status; 2448 } 2449 2450 static NV_STATUS discover_nvlink_peers(uvm_gpu_t *gpu) 2451 { 2452 NV_STATUS status = NV_OK; 2453 uvm_gpu_t *other_gpu; 2454 2455 UVM_ASSERT(gpu); 2456 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2457 2458 if (gpu->parent->smc.enabled) 2459 return NV_OK; 2460 2461 for_each_global_gpu(other_gpu) { 2462 UvmGpuP2PCapsParams p2p_caps_params; 2463 2464 if ((other_gpu == gpu) || other_gpu->parent->smc.enabled) 2465 continue; 2466 2467 status = get_p2p_caps(gpu, other_gpu, &p2p_caps_params); 2468 if (status != NV_OK) 2469 goto cleanup; 2470 2471 // PCIe peers need to be explicitly enabled via UvmEnablePeerAccess 2472 if (p2p_caps_params.p2pLink == UVM_LINK_TYPE_NONE || p2p_caps_params.p2pLink == UVM_LINK_TYPE_PCIE) 2473 continue; 2474 2475 // Indirect peers are only supported when onlined as NUMA nodes, because 2476 // we want to use vm_insert_page and dma_map_page. 2477 if (p2p_caps_params.indirectAccess && (!gpu->mem_info.numa.enabled || !other_gpu->mem_info.numa.enabled)) 2478 continue; 2479 2480 status = enable_nvlink_peer_access(gpu, other_gpu, &p2p_caps_params); 2481 if (status != NV_OK) 2482 goto cleanup; 2483 } 2484 2485 return NV_OK; 2486 2487 cleanup: 2488 destroy_nvlink_peers(gpu); 2489 2490 return status; 2491 } 2492 2493 static void destroy_nvlink_peers(uvm_gpu_t *gpu) 2494 { 2495 uvm_gpu_t *other_gpu; 2496 2497 UVM_ASSERT(gpu); 2498 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2499 2500 if (gpu->parent->smc.enabled) 2501 return; 2502 2503 for_each_global_gpu(other_gpu) { 2504 uvm_gpu_peer_t *peer_caps; 2505 2506 if ((other_gpu == gpu) || other_gpu->parent->smc.enabled) 2507 continue; 2508 2509 peer_caps = uvm_gpu_peer_caps(gpu, other_gpu); 2510 2511 // PCIe peers need to be explicitly destroyed via UvmDisablePeerAccess 2512 if (peer_caps->link_type == UVM_GPU_LINK_INVALID || peer_caps->link_type == UVM_GPU_LINK_PCIE) 2513 continue; 2514 2515 disable_peer_access(gpu, other_gpu); 2516 } 2517 } 2518 2519 NV_STATUS uvm_gpu_retain_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 2520 { 2521 NV_STATUS status = NV_OK; 2522 uvm_gpu_peer_t *peer_caps; 2523 2524 UVM_ASSERT(gpu0); 2525 UVM_ASSERT(gpu1); 2526 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2527 2528 peer_caps = uvm_gpu_peer_caps(gpu0, gpu1); 2529 2530 // Insert an entry into global peer table, if not present. 2531 if (peer_caps->link_type == UVM_GPU_LINK_INVALID) { 2532 UVM_ASSERT(peer_caps->ref_count == 0); 2533 2534 status = enable_pcie_peer_access(gpu0, gpu1); 2535 if (status != NV_OK) 2536 return status; 2537 } 2538 else if (peer_caps->link_type != UVM_GPU_LINK_PCIE) { 2539 return NV_ERR_INVALID_DEVICE; 2540 } 2541 2542 // GPUs can't be destroyed until their peer pairings have also been 2543 // destroyed. 
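    // For symmetry: each successful retain here is balanced by
    // uvm_gpu_release_locked() on both GPUs in uvm_gpu_release_pcie_peer_access(),
    // which also tears the pairing down via disable_peer_access() once ref_count
    // returns to zero.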
2544 uvm_gpu_retain(gpu0); 2545 uvm_gpu_retain(gpu1); 2546 2547 peer_caps->ref_count++; 2548 2549 return status; 2550 } 2551 2552 static void disable_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 2553 { 2554 uvm_gpu_peer_t *peer_caps; 2555 NvHandle p2p_handle = 0; 2556 2557 UVM_ASSERT(gpu0); 2558 UVM_ASSERT(gpu1); 2559 2560 // P2P is not supported under SMC partitioning 2561 UVM_ASSERT(!gpu0->parent->smc.enabled); 2562 UVM_ASSERT(!gpu1->parent->smc.enabled); 2563 2564 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2565 2566 peer_caps = uvm_gpu_peer_caps(gpu0, gpu1); 2567 2568 if (uvm_procfs_is_debug_enabled()) 2569 deinit_procfs_peer_cap_files(peer_caps); 2570 2571 p2p_handle = peer_caps->p2p_handle; 2572 2573 if (peer_caps->is_indirect_peer) { 2574 uvm_pmm_gpu_indirect_peer_destroy(&gpu0->pmm, gpu1); 2575 uvm_pmm_gpu_indirect_peer_destroy(&gpu1->pmm, gpu0); 2576 } 2577 else { 2578 UVM_ASSERT(p2p_handle); 2579 2580 uvm_mmu_destroy_peer_identity_mappings(gpu0, gpu1); 2581 uvm_mmu_destroy_peer_identity_mappings(gpu1, gpu0); 2582 2583 uvm_rm_locked_call_void(nvUvmInterfaceP2pObjectDestroy(uvm_gpu_session_handle(gpu0), p2p_handle)); 2584 2585 UVM_ASSERT(uvm_gpu_get(gpu0->global_id) == gpu0); 2586 UVM_ASSERT(uvm_gpu_get(gpu1->global_id) == gpu1); 2587 2588 uvm_spin_lock(&gpu0->peer_info.peer_gpus_lock); 2589 uvm_processor_mask_clear(&gpu0->peer_info.peer_gpu_mask, gpu1->id); 2590 gpu0->peer_info.peer_gpus[uvm_id_gpu_index(gpu1->id)] = NULL; 2591 uvm_spin_unlock(&gpu0->peer_info.peer_gpus_lock); 2592 2593 uvm_spin_lock(&gpu1->peer_info.peer_gpus_lock); 2594 uvm_processor_mask_clear(&gpu1->peer_info.peer_gpu_mask, gpu0->id); 2595 gpu1->peer_info.peer_gpus[uvm_id_gpu_index(gpu0->id)] = NULL; 2596 uvm_spin_unlock(&gpu1->peer_info.peer_gpus_lock); 2597 } 2598 2599 // Flush the access counter buffer to avoid getting stale notifications for 2600 // accesses to GPUs to which peer access is being disabled. This is also 2601 // needed in the case of disabling automatic (NVLINK) peers on GPU 2602 // unregister, because access counter processing might still be using GPU 2603 // IDs queried from the peer table above which are about to be removed from 2604 // the global table. 
2605 if (gpu0->parent->access_counters_supported) 2606 uvm_gpu_access_counter_buffer_flush(gpu0); 2607 if (gpu1->parent->access_counters_supported) 2608 uvm_gpu_access_counter_buffer_flush(gpu1); 2609 2610 memset(peer_caps, 0, sizeof(*peer_caps)); 2611 } 2612 2613 void uvm_gpu_release_pcie_peer_access(uvm_gpu_t *gpu0, uvm_gpu_t *gpu1) 2614 { 2615 uvm_gpu_peer_t *peer_caps; 2616 UVM_ASSERT(gpu0); 2617 UVM_ASSERT(gpu1); 2618 uvm_assert_mutex_locked(&g_uvm_global.global_lock); 2619 2620 peer_caps = uvm_gpu_peer_caps(gpu0, gpu1); 2621 2622 UVM_ASSERT(peer_caps->ref_count > 0); 2623 UVM_ASSERT(peer_caps->link_type == UVM_GPU_LINK_PCIE); 2624 peer_caps->ref_count--; 2625 2626 if (peer_caps->ref_count == 0) 2627 disable_peer_access(gpu0, gpu1); 2628 2629 uvm_gpu_release_locked(gpu0); 2630 uvm_gpu_release_locked(gpu1); 2631 } 2632 2633 static uvm_aperture_t uvm_gpu_peer_caps_aperture(uvm_gpu_peer_t *peer_caps, uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu) 2634 { 2635 size_t peer_index; 2636 UVM_ASSERT(peer_caps->link_type != UVM_GPU_LINK_INVALID); 2637 2638 // Indirect peers are accessed as sysmem addresses 2639 if (peer_caps->is_indirect_peer) 2640 return UVM_APERTURE_SYS; 2641 2642 if (uvm_id_value(local_gpu->id) < uvm_id_value(remote_gpu->id)) 2643 peer_index = 0; 2644 else 2645 peer_index = 1; 2646 2647 return UVM_APERTURE_PEER(peer_caps->peer_ids[peer_index]); 2648 } 2649 2650 uvm_aperture_t uvm_gpu_peer_aperture(uvm_gpu_t *local_gpu, uvm_gpu_t *remote_gpu) 2651 { 2652 uvm_gpu_peer_t *peer_caps = uvm_gpu_peer_caps(local_gpu, remote_gpu); 2653 return uvm_gpu_peer_caps_aperture(peer_caps, local_gpu, remote_gpu); 2654 } 2655 2656 uvm_aperture_t uvm_gpu_page_tree_init_location(const uvm_gpu_t *gpu) 2657 { 2658 // See comment in page_tree_set_location 2659 if (uvm_gpu_is_virt_mode_sriov_heavy(gpu)) 2660 return UVM_APERTURE_VID; 2661 2662 if (uvm_conf_computing_mode_enabled(gpu)) 2663 return UVM_APERTURE_VID; 2664 2665 return UVM_APERTURE_DEFAULT; 2666 } 2667 2668 uvm_processor_id_t uvm_gpu_get_processor_id_by_address(uvm_gpu_t *gpu, uvm_gpu_phys_address_t addr) 2669 { 2670 uvm_processor_id_t id = UVM_ID_INVALID; 2671 2672 // TODO: Bug 1899622: On P9 systems with multiple CPU sockets, SYS aperture 2673 // is also reported for accesses to remote GPUs connected to a different CPU 2674 // NUMA domain. We will need to determine the actual processor id using the 2675 // reported physical address. 2676 if (addr.aperture == UVM_APERTURE_SYS) 2677 return UVM_ID_CPU; 2678 else if (addr.aperture == UVM_APERTURE_VID) 2679 return gpu->id; 2680 2681 uvm_spin_lock(&gpu->peer_info.peer_gpus_lock); 2682 2683 for_each_gpu_id_in_mask(id, &gpu->peer_info.peer_gpu_mask) { 2684 uvm_gpu_t *other_gpu = gpu->peer_info.peer_gpus[uvm_id_gpu_index(id)]; 2685 UVM_ASSERT(other_gpu); 2686 2687 if (uvm_gpus_are_nvswitch_connected(gpu, other_gpu)) { 2688 // NVSWITCH connected systems use an extended physical address to 2689 // map to peers. 
Find the physical memory 'slot' containing the 2690 // given physical address to find the peer gpu that owns the 2691 // physical address 2692 NvU64 fabric_window_end = other_gpu->parent->nvswitch_info.fabric_memory_window_start + 2693 other_gpu->mem_info.max_allocatable_address; 2694 2695 if (other_gpu->parent->nvswitch_info.fabric_memory_window_start <= addr.address && 2696 fabric_window_end >= addr.address) 2697 break; 2698 } 2699 else if (uvm_gpu_peer_aperture(gpu, other_gpu) == addr.aperture) { 2700 break; 2701 } 2702 } 2703 2704 uvm_spin_unlock(&gpu->peer_info.peer_gpus_lock); 2705 2706 return id; 2707 } 2708 2709 uvm_gpu_peer_t *uvm_gpu_index_peer_caps(const uvm_gpu_id_t gpu_id1, const uvm_gpu_id_t gpu_id2) 2710 { 2711 NvU32 table_index = uvm_gpu_peer_table_index(gpu_id1, gpu_id2); 2712 return &g_uvm_global.peers[table_index]; 2713 } 2714 2715 static NvU64 instance_ptr_to_key(uvm_gpu_phys_address_t instance_ptr) 2716 { 2717 NvU64 key; 2718 int is_sys = (instance_ptr.aperture == UVM_APERTURE_SYS); 2719 2720 // Instance pointers must be 4k aligned and they must have either VID or SYS 2721 // apertures. Compress them as much as we can both to guarantee that the key 2722 // fits within 64 bits, and to make the table as shallow as possible. 2723 UVM_ASSERT(IS_ALIGNED(instance_ptr.address, UVM_PAGE_SIZE_4K)); 2724 UVM_ASSERT(instance_ptr.aperture == UVM_APERTURE_VID || instance_ptr.aperture == UVM_APERTURE_SYS); 2725 2726 key = (instance_ptr.address >> 11) | is_sys; 2727 2728 return key; 2729 } 2730 2731 static NV_STATUS gpu_add_user_channel_subctx_info(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2732 { 2733 uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr; 2734 NV_STATUS status = NV_OK; 2735 uvm_rb_tree_node_t *channel_tree_node; 2736 uvm_user_channel_subctx_info_t *channel_subctx_info; 2737 uvm_user_channel_subctx_info_t *new_channel_subctx_info = NULL; 2738 uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space; 2739 2740 if (!user_channel->in_subctx) 2741 return NV_OK; 2742 2743 // Pre-allocate a subcontext info descriptor out of the lock, in case we 2744 // need to add a new entry to the tree 2745 new_channel_subctx_info = uvm_kvmalloc_zero(sizeof(*new_channel_subctx_info)); 2746 2747 // Don't check for the result of the allocation since it is only needed 2748 // if the TSG has not been registered yet, and we do that under the lock 2749 // below 2750 if (new_channel_subctx_info) { 2751 new_channel_subctx_info->subctxs = 2752 uvm_kvmalloc_zero(sizeof(*new_channel_subctx_info->subctxs) * user_channel->tsg.max_subctx_count); 2753 } 2754 2755 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 2756 2757 // Check if the subcontext information for the channel already exists 2758 channel_tree_node = uvm_rb_tree_find(&gpu->parent->tsg_table, user_channel->tsg.id); 2759 2760 if (!channel_tree_node) { 2761 // We could not allocate the descriptor before taking the lock. 
Exiting 2762 if (!new_channel_subctx_info || !new_channel_subctx_info->subctxs) { 2763 status = NV_ERR_NO_MEMORY; 2764 goto exit_unlock; 2765 } 2766 2767 // Insert the new subcontext information descriptor 2768 new_channel_subctx_info->node.key = user_channel->tsg.id; 2769 status = uvm_rb_tree_insert(&gpu->parent->tsg_table, &new_channel_subctx_info->node); 2770 UVM_ASSERT(status == NV_OK); 2771 2772 channel_subctx_info = new_channel_subctx_info; 2773 channel_subctx_info->smc_engine_id = user_channel->smc_engine_id; 2774 } 2775 else { 2776 channel_subctx_info = container_of(channel_tree_node, uvm_user_channel_subctx_info_t, node); 2777 UVM_ASSERT(channel_subctx_info->smc_engine_id == user_channel->smc_engine_id); 2778 } 2779 2780 user_channel->subctx_info = channel_subctx_info; 2781 2782 // Register the VA space of the channel subcontext info descriptor, or 2783 // check that the existing one matches the channel's 2784 if (channel_subctx_info->subctxs[user_channel->subctx_id].refcount++ > 0) { 2785 UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == va_space, 2786 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n", 2787 user_channel->hw_runlist_id, 2788 user_channel->hw_channel_id, 2789 instance_ptr.address, 2790 uvm_aperture_string(instance_ptr.aperture), 2791 user_channel->subctx_id, 2792 user_channel->tsg.id, 2793 (NvU64)va_space, 2794 (NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space); 2795 UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space != NULL, 2796 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: VA space is NULL\n", 2797 user_channel->hw_runlist_id, 2798 user_channel->hw_channel_id, 2799 instance_ptr.address, 2800 uvm_aperture_string(instance_ptr.aperture), 2801 user_channel->subctx_id, 2802 user_channel->tsg.id); 2803 UVM_ASSERT_MSG(channel_subctx_info->total_refcount > 0, 2804 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: TSG refcount is 0\n", 2805 user_channel->hw_runlist_id, 2806 user_channel->hw_channel_id, 2807 instance_ptr.address, 2808 uvm_aperture_string(instance_ptr.aperture), 2809 user_channel->subctx_id, 2810 user_channel->tsg.id); 2811 } 2812 else { 2813 UVM_ASSERT_MSG(channel_subctx_info->subctxs[user_channel->subctx_id].va_space == NULL, 2814 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space NULL but got 0x%llx instead\n", 2815 user_channel->hw_runlist_id, 2816 user_channel->hw_channel_id, 2817 instance_ptr.address, 2818 uvm_aperture_string(instance_ptr.aperture), 2819 user_channel->subctx_id, 2820 user_channel->tsg.id, 2821 (NvU64)channel_subctx_info->subctxs[user_channel->subctx_id].va_space); 2822 2823 channel_subctx_info->subctxs[user_channel->subctx_id].va_space = va_space; 2824 } 2825 2826 ++channel_subctx_info->total_refcount; 2827 2828 exit_unlock: 2829 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 2830 2831 // Remove the pre-allocated per-TSG subctx information struct if there was 2832 // some error or it was not used 2833 if (status != NV_OK || user_channel->subctx_info != new_channel_subctx_info) { 2834 if (new_channel_subctx_info) 2835 uvm_kvfree(new_channel_subctx_info->subctxs); 2836 2837 uvm_kvfree(new_channel_subctx_info); 2838 } 2839 2840 return status; 2841 } 2842 2843 static void gpu_remove_user_channel_subctx_info_locked(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2844 { 2845 uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr; 2846 
uvm_va_space_t *va_space = user_channel->gpu_va_space->va_space; 2847 2848 uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock); 2849 2850 // Channel subcontext info descriptor may not have been registered in 2851 // tsg_table since this function is called in some teardown paths during 2852 // channel creation 2853 if (!user_channel->subctx_info) 2854 return; 2855 2856 UVM_ASSERT_MSG(&user_channel->subctx_info->node == 2857 uvm_rb_tree_find(&gpu->parent->tsg_table, user_channel->subctx_info->node.key), 2858 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: SubCTX not found in TSG table\n", 2859 user_channel->hw_runlist_id, 2860 user_channel->hw_channel_id, 2861 instance_ptr.address, 2862 uvm_aperture_string(instance_ptr.aperture), 2863 user_channel->subctx_id, 2864 user_channel->tsg.id); 2865 2866 UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].refcount > 0, 2867 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: SubCTX refcount is 0\n", 2868 user_channel->hw_runlist_id, 2869 user_channel->hw_channel_id, 2870 instance_ptr.address, 2871 uvm_aperture_string(instance_ptr.aperture), 2872 user_channel->subctx_id, 2873 user_channel->tsg.id); 2874 2875 UVM_ASSERT_MSG(user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space == va_space, 2876 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: expected VA space 0x%llx but got 0x%llx instead\n", 2877 user_channel->hw_runlist_id, 2878 user_channel->hw_channel_id, 2879 instance_ptr.address, 2880 uvm_aperture_string(instance_ptr.aperture), 2881 user_channel->subctx_id, 2882 user_channel->tsg.id, 2883 (NvU64)va_space, 2884 (NvU64)user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space); 2885 2886 UVM_ASSERT_MSG(user_channel->subctx_info->total_refcount > 0, 2887 "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: TSG refcount is 0\n", 2888 user_channel->hw_runlist_id, 2889 user_channel->hw_channel_id, 2890 instance_ptr.address, 2891 uvm_aperture_string(instance_ptr.aperture), 2892 user_channel->subctx_id, 2893 user_channel->tsg.id); 2894 2895 // Decrement VA space refcount. 
If it gets to zero, unregister the pointer 2896 if (--user_channel->subctx_info->subctxs[user_channel->subctx_id].refcount == 0) 2897 user_channel->subctx_info->subctxs[user_channel->subctx_id].va_space = NULL; 2898 2899 if (--user_channel->subctx_info->total_refcount == 0) { 2900 uvm_rb_tree_remove(&gpu->parent->tsg_table, &user_channel->subctx_info->node); 2901 uvm_kvfree(user_channel->subctx_info->subctxs); 2902 uvm_kvfree(user_channel->subctx_info); 2903 } 2904 2905 user_channel->subctx_info = NULL; 2906 } 2907 2908 static void gpu_remove_user_channel_subctx_info(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2909 { 2910 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 2911 gpu_remove_user_channel_subctx_info_locked(gpu, user_channel); 2912 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 2913 } 2914 2915 static void gpu_add_user_channel_instance_ptr(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2916 { 2917 uvm_gpu_phys_address_t instance_ptr = user_channel->instance_ptr.addr; 2918 NvU64 instance_ptr_key = instance_ptr_to_key(instance_ptr); 2919 NV_STATUS status; 2920 2921 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 2922 2923 // Insert the instance_ptr -> user_channel mapping 2924 user_channel->instance_ptr.node.key = instance_ptr_key; 2925 status = uvm_rb_tree_insert(&gpu->parent->instance_ptr_table, &user_channel->instance_ptr.node); 2926 2927 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 2928 2929 UVM_ASSERT_MSG(status == NV_OK, "CH %u:%u instance_ptr {0x%llx:%s} SubCTX %u in TSG %u: error %s\n", 2930 user_channel->hw_runlist_id, 2931 user_channel->hw_channel_id, 2932 instance_ptr.address, 2933 uvm_aperture_string(instance_ptr.aperture), 2934 user_channel->subctx_id, 2935 user_channel->tsg.id, 2936 nvstatusToString(status)); 2937 } 2938 2939 static void gpu_remove_user_channel_instance_ptr_locked(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2940 { 2941 uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock); 2942 2943 if (UVM_RB_TREE_EMPTY_NODE(&user_channel->instance_ptr.node)) 2944 return; 2945 2946 uvm_rb_tree_remove(&gpu->parent->instance_ptr_table, &user_channel->instance_ptr.node); 2947 } 2948 2949 NV_STATUS uvm_gpu_add_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 2950 { 2951 uvm_va_space_t *va_space; 2952 uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space; 2953 NV_STATUS status; 2954 2955 UVM_ASSERT(user_channel->rm_retained_channel); 2956 UVM_ASSERT(gpu_va_space); 2957 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 2958 va_space = gpu_va_space->va_space; 2959 uvm_assert_rwsem_locked(&va_space->lock); 2960 2961 status = gpu_add_user_channel_subctx_info(gpu, user_channel); 2962 if (status != NV_OK) 2963 return status; 2964 2965 gpu_add_user_channel_instance_ptr(gpu, user_channel); 2966 2967 return NV_OK; 2968 } 2969 2970 static uvm_user_channel_t *instance_ptr_to_user_channel(uvm_gpu_t *gpu, uvm_gpu_phys_address_t instance_ptr) 2971 { 2972 NvU64 key = instance_ptr_to_key(instance_ptr); 2973 uvm_rb_tree_node_t *instance_node; 2974 2975 uvm_assert_spinlock_locked(&gpu->parent->instance_ptr_table_lock); 2976 2977 instance_node = uvm_rb_tree_find(&gpu->parent->instance_ptr_table, key); 2978 if (!instance_node) 2979 return NULL; 2980 2981 return get_user_channel(instance_node); 2982 } 2983 2984 static uvm_va_space_t *user_channel_and_subctx_to_va_space(uvm_user_channel_t *user_channel, NvU32 subctx_id) 2985 { 2986 uvm_user_channel_subctx_info_t 
*channel_subctx_info; 2987 2988 UVM_ASSERT(user_channel); 2989 UVM_ASSERT(user_channel->in_subctx); 2990 UVM_ASSERT(user_channel->subctx_info); 2991 2992 uvm_assert_spinlock_locked(&user_channel->gpu->parent->instance_ptr_table_lock); 2993 2994 channel_subctx_info = user_channel->subctx_info; 2995 2996 UVM_ASSERT_MSG(subctx_id < user_channel->tsg.max_subctx_count, 2997 "instance_ptr {0x%llx:%s} in TSG %u. Invalid SubCTX %u\n", 2998 user_channel->instance_ptr.addr.address, 2999 uvm_aperture_string(user_channel->instance_ptr.addr.aperture), 3000 user_channel->tsg.id, 3001 subctx_id); 3002 UVM_ASSERT_MSG(channel_subctx_info->total_refcount > 0, 3003 "instance_ptr {0x%llx:%s} in TSG %u: TSG refcount is 0\n", 3004 user_channel->instance_ptr.addr.address, 3005 uvm_aperture_string(user_channel->instance_ptr.addr.aperture), 3006 user_channel->tsg.id); 3007 3008 // A subcontext's refcount can be zero if that subcontext is torn down 3009 // uncleanly and work from that subcontext continues running with work from 3010 // other subcontexts. 3011 if (channel_subctx_info->subctxs[subctx_id].refcount == 0) { 3012 UVM_ASSERT(channel_subctx_info->subctxs[subctx_id].va_space == NULL); 3013 } 3014 else { 3015 UVM_ASSERT_MSG(channel_subctx_info->subctxs[subctx_id].va_space, 3016 "instance_ptr {0x%llx:%s} in TSG %u: no VA space for SubCTX %u\n", 3017 user_channel->instance_ptr.addr.address, 3018 uvm_aperture_string(user_channel->instance_ptr.addr.aperture), 3019 user_channel->tsg.id, 3020 subctx_id); 3021 } 3022 3023 return channel_subctx_info->subctxs[subctx_id].va_space; 3024 } 3025 3026 NV_STATUS uvm_gpu_fault_entry_to_va_space(uvm_gpu_t *gpu, 3027 uvm_fault_buffer_entry_t *fault, 3028 uvm_va_space_t **out_va_space) 3029 { 3030 uvm_user_channel_t *user_channel; 3031 NV_STATUS status = NV_OK; 3032 3033 *out_va_space = NULL; 3034 3035 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 3036 3037 user_channel = instance_ptr_to_user_channel(gpu, fault->instance_ptr); 3038 if (!user_channel) { 3039 status = NV_ERR_INVALID_CHANNEL; 3040 goto exit_unlock; 3041 } 3042 3043 // Faults from HUB clients will always report VEID 0 even if the channel 3044 // belongs a TSG with many subcontexts. Therefore, we cannot use the per-TSG 3045 // subctx table and we need to directly return the channel's VA space 3046 if (!user_channel->in_subctx || (fault->fault_source.client_type == UVM_FAULT_CLIENT_TYPE_HUB)) { 3047 UVM_ASSERT_MSG(fault->fault_source.ve_id == 0, 3048 "Fault packet contains SubCTX %u for channel not in subctx\n", 3049 fault->fault_source.ve_id); 3050 3051 // We can safely access user_channel->gpu_va_space under the 3052 // instance_ptr_table_lock since gpu_va_space is set to NULL after this 3053 // function is called in uvm_user_channel_detach 3054 UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 3055 *out_va_space = user_channel->gpu_va_space->va_space; 3056 } 3057 else { 3058 NvU32 ve_id = fault->fault_source.ve_id; 3059 3060 // Compute the SMC engine-local VEID 3061 UVM_ASSERT(ve_id >= user_channel->smc_engine_ve_id_offset); 3062 3063 ve_id -= user_channel->smc_engine_ve_id_offset; 3064 3065 *out_va_space = user_channel_and_subctx_to_va_space(user_channel, ve_id); 3066 3067 // Instance pointer is valid but the fault targets a non-existent 3068 // subcontext. 
3069 if (!*out_va_space) 3070 status = NV_ERR_PAGE_TABLE_NOT_AVAIL; 3071 } 3072 3073 exit_unlock: 3074 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 3075 3076 return status; 3077 } 3078 3079 NV_STATUS uvm_gpu_access_counter_entry_to_va_space(uvm_gpu_t *gpu, 3080 uvm_access_counter_buffer_entry_t *entry, 3081 uvm_va_space_t **out_va_space) 3082 { 3083 uvm_user_channel_t *user_channel; 3084 NV_STATUS status = NV_OK; 3085 3086 *out_va_space = NULL; 3087 UVM_ASSERT(entry->address.is_virtual); 3088 3089 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 3090 3091 user_channel = instance_ptr_to_user_channel(gpu, entry->virtual_info.instance_ptr); 3092 if (!user_channel) { 3093 status = NV_ERR_INVALID_CHANNEL; 3094 goto exit_unlock; 3095 } 3096 3097 if (!user_channel->in_subctx) { 3098 UVM_ASSERT_MSG(entry->virtual_info.ve_id == 0, 3099 "Access counter packet contains SubCTX %u for channel not in subctx\n", 3100 entry->virtual_info.ve_id); 3101 3102 UVM_ASSERT(uvm_gpu_va_space_state(user_channel->gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 3103 *out_va_space = user_channel->gpu_va_space->va_space; 3104 } 3105 else { 3106 *out_va_space = user_channel_and_subctx_to_va_space(user_channel, entry->virtual_info.ve_id); 3107 if (!*out_va_space) 3108 status = NV_ERR_PAGE_TABLE_NOT_AVAIL; 3109 } 3110 3111 exit_unlock: 3112 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 3113 3114 return status; 3115 } 3116 3117 void uvm_gpu_remove_user_channel(uvm_gpu_t *gpu, uvm_user_channel_t *user_channel) 3118 { 3119 uvm_va_space_t *va_space; 3120 uvm_gpu_va_space_t *gpu_va_space = user_channel->gpu_va_space; 3121 3122 UVM_ASSERT(user_channel->rm_retained_channel); 3123 UVM_ASSERT(gpu_va_space); 3124 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE); 3125 va_space = gpu_va_space->va_space; 3126 uvm_assert_rwsem_locked_write(&va_space->lock); 3127 3128 uvm_spin_lock(&gpu->parent->instance_ptr_table_lock); 3129 gpu_remove_user_channel_subctx_info_locked(gpu, user_channel); 3130 gpu_remove_user_channel_instance_ptr_locked(gpu, user_channel); 3131 uvm_spin_unlock(&gpu->parent->instance_ptr_table_lock); 3132 } 3133 3134 static NvU64 gpu_addr_to_dma_addr(uvm_parent_gpu_t *parent_gpu, NvU64 gpu_addr) 3135 { 3136 NvU64 dma_addr = gpu_addr; 3137 UVM_ASSERT(dma_addr <= dma_addr + parent_gpu->dma_addressable_start); 3138 3139 if (parent_gpu->npu) 3140 dma_addr = nv_expand_nvlink_addr(dma_addr); 3141 3142 dma_addr += parent_gpu->dma_addressable_start; 3143 3144 return dma_addr; 3145 } 3146 3147 // The GPU has its NV_PFB_XV_UPPER_ADDR register set by RM to 3148 // dma_addressable_start (in bifSetupDmaWindow_IMPL()) and hence when 3149 // referencing sysmem from the GPU, dma_addressable_start should be 3150 // subtracted from the DMA address we get from the OS. 3151 static NvU64 dma_addr_to_gpu_addr(uvm_parent_gpu_t *parent_gpu, NvU64 dma_addr) 3152 { 3153 NvU64 gpu_addr = dma_addr - parent_gpu->dma_addressable_start; 3154 UVM_ASSERT(dma_addr >= gpu_addr); 3155 3156 // See Bug 1920398 for background and details about NVLink DMA address 3157 // transformations being applied here. 
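    // Worked example (illustrative, hypothetical values): with
    // dma_addressable_start = 0x800000000 and no NPU present, an OS DMA address
    // of 0x812345000 becomes GPU address 0x12345000 here, and
    // gpu_addr_to_dma_addr() above applies the inverse (+ 0x800000000). On NPU
    // systems, the nv_compress_nvlink_addr()/nv_expand_nvlink_addr() step below
    // is applied on top of that offset.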
void *uvm_gpu_dma_alloc_page(uvm_parent_gpu_t *parent_gpu, gfp_t gfp_flags, NvU64 *dma_address_out)
{
    NvU64 dma_addr;
    void *cpu_addr;

    cpu_addr = dma_alloc_coherent(&parent_gpu->pci_dev->dev, PAGE_SIZE, &dma_addr, gfp_flags);

    if (!cpu_addr)
        return cpu_addr;

    *dma_address_out = dma_addr_to_gpu_addr(parent_gpu, dma_addr);
    atomic64_add(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
    return cpu_addr;
}

void uvm_gpu_dma_free_page(uvm_parent_gpu_t *parent_gpu, void *va, NvU64 dma_address)
{
    dma_address = gpu_addr_to_dma_addr(parent_gpu, dma_address);
    dma_free_coherent(&parent_gpu->pci_dev->dev, PAGE_SIZE, va, dma_address);
    atomic64_sub(PAGE_SIZE, &parent_gpu->mapped_cpu_pages_size);
}

NV_STATUS uvm_gpu_map_cpu_pages(uvm_parent_gpu_t *parent_gpu, struct page *page, size_t size, NvU64 *dma_address_out)
{
    NvU64 dma_addr;

    UVM_ASSERT(PAGE_ALIGNED(size));

    dma_addr = dma_map_page(&parent_gpu->pci_dev->dev, page, 0, size, DMA_BIDIRECTIONAL);
    if (dma_mapping_error(&parent_gpu->pci_dev->dev, dma_addr))
        return NV_ERR_OPERATING_SYSTEM;

    if (dma_addr < parent_gpu->dma_addressable_start ||
        dma_addr + size - 1 > parent_gpu->dma_addressable_limit) {
        dma_unmap_page(&parent_gpu->pci_dev->dev, dma_addr, size, DMA_BIDIRECTIONAL);
        UVM_ERR_PRINT_RL("PCI mapped range [0x%llx, 0x%llx) not in the addressable range [0x%llx, 0x%llx), GPU %s\n",
                         dma_addr,
                         dma_addr + (NvU64)size,
                         parent_gpu->dma_addressable_start,
                         parent_gpu->dma_addressable_limit + 1,
                         parent_gpu->name);
        return NV_ERR_INVALID_ADDRESS;
    }

    atomic64_add(size, &parent_gpu->mapped_cpu_pages_size);
    *dma_address_out = dma_addr_to_gpu_addr(parent_gpu, dma_addr);

    return NV_OK;
}

void uvm_gpu_unmap_cpu_pages(uvm_parent_gpu_t *parent_gpu, NvU64 dma_address, size_t size)
{
    UVM_ASSERT(PAGE_ALIGNED(size));

    dma_address = gpu_addr_to_dma_addr(parent_gpu, dma_address);
    dma_unmap_page(&parent_gpu->pci_dev->dev, dma_address, size, DMA_BIDIRECTIONAL);
    atomic64_sub(size, &parent_gpu->mapped_cpu_pages_size);
}
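// Illustrative sketch, not taken from the driver: the expected pairing of
// uvm_gpu_map_cpu_pages() and uvm_gpu_unmap_cpu_pages(). The caller receives a
// GPU-relative address (already passed through dma_addr_to_gpu_addr()) and
// must hand that same address back when unmapping. The helper name
// example_map_page_for_gpu() is hypothetical.
static NV_STATUS __maybe_unused example_map_page_for_gpu(uvm_parent_gpu_t *parent_gpu, struct page *page)
{
    NvU64 gpu_mapping_addr;
    NV_STATUS status;

    // Map a single CPU page so the GPU can reach it over the system bus
    status = uvm_gpu_map_cpu_pages(parent_gpu, page, PAGE_SIZE, &gpu_mapping_addr);
    if (status != NV_OK)
        return status;

    // ... hand gpu_mapping_addr to the GPU (page tables, pushbuffers, etc.) ...

    // Unmap with the same GPU-relative address and size used for the mapping
    uvm_gpu_unmap_cpu_pages(parent_gpu, gpu_mapping_addr, PAGE_SIZE);

    return NV_OK;
}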
// This function implements the UvmRegisterGpu API call, as described in uvm.h.
// Notes:
//
// 1. The UVM VA space has a 1-to-1 relationship with an open instance of
// /dev/nvidia-uvm. That, in turn, has a 1-to-1 relationship with a process,
// because the user-level UVM code (os-user-linux.c, for example) enforces an
// "open /dev/nvidia-uvm only once per process" policy. So a UVM VA space is
// very close to a process's VA space.
//
// If that user space code fails or is not used, then the relationship is no
// longer 1-to-1. That situation requires that this code avoid crashing,
// leaking resources, exhibiting security holes, etc, but it does not have to
// provide correct UVM API behavior. Correct UVM API behavior requires doing
// the right things in user space before calling into the kernel.
//
// 2. The uvm_api*() routines are invoked directly from the top-level ioctl
// handler. They are considered "API routing routines", because they are
// responsible for providing the behavior that is described in the UVM
// user-to-kernel API documentation, in uvm.h.
//
// 3. A GPU VA space, which you'll see in other parts of the driver, is
// something different: there may be more than one GPU VA space within a
// process, and therefore within a UVM VA space.
//
NV_STATUS uvm_api_register_gpu(UVM_REGISTER_GPU_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_rm_user_object_t user_rm_va_space = {
        .rm_control_fd = params->rmCtrlFd,
        .user_client = params->hClient,
        .user_object = params->hSmcPartRef,
    };

    return uvm_va_space_register_gpu(va_space,
                                     &params->gpu_uuid,
                                     &user_rm_va_space,
                                     &params->numaEnabled,
                                     &params->numaNodeId);
}

NV_STATUS uvm_api_unregister_gpu(UVM_UNREGISTER_GPU_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    return uvm_va_space_unregister_gpu(va_space, &params->gpu_uuid);
}

NV_STATUS uvm_api_register_gpu_va_space(UVM_REGISTER_GPU_VASPACE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_rm_user_object_t user_rm_va_space = {
        .rm_control_fd = params->rmCtrlFd,
        .user_client = params->hClient,
        .user_object = params->hVaSpace
    };

    return uvm_va_space_register_gpu_va_space(va_space, &user_rm_va_space, &params->gpuUuid);
}

NV_STATUS uvm_api_unregister_gpu_va_space(UVM_UNREGISTER_GPU_VASPACE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    return uvm_va_space_unregister_gpu_va_space(va_space, &params->gpuUuid);
}

NV_STATUS uvm_api_pageable_mem_access_on_gpu(UVM_PAGEABLE_MEM_ACCESS_ON_GPU_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_gpu_t *gpu;

    uvm_va_space_down_read(va_space);
    gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);

    if (!gpu) {
        uvm_va_space_up_read(va_space);
        return NV_ERR_INVALID_DEVICE;
    }

    if (uvm_va_space_pageable_mem_access_supported(va_space) && gpu->parent->replayable_faults_supported)
        params->pageableMemAccess = NV_TRUE;
    else
        params->pageableMemAccess = NV_FALSE;

    uvm_va_space_up_read(va_space);
    return NV_OK;
}

NV_STATUS uvm_test_set_prefetch_filtering(UVM_TEST_SET_PREFETCH_FILTERING_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_gpu_t *gpu = NULL;
    NV_STATUS status = NV_OK;

    uvm_mutex_lock(&g_uvm_global.global_lock);

    uvm_va_space_down_read(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);

    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto done;
    }

    if (!gpu->parent->isr.replayable_faults.handling || !gpu->parent->prefetch_fault_supported) {
        status = NV_ERR_INVALID_DEVICE;
        goto done;
    }

    switch (params->filtering_mode) {
        case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_ALL:
            uvm_gpu_disable_prefetch_faults(gpu->parent);
            break;
        case UVM_TEST_PREFETCH_FILTERING_MODE_FILTER_NONE:
            uvm_gpu_enable_prefetch_faults(gpu->parent);
            break;
        default:
            status = NV_ERR_INVALID_ARGUMENT;
            break;
    }

done:
    uvm_va_space_up_read(va_space);

    uvm_mutex_unlock(&g_uvm_global.global_lock);
    return status;
}
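// Illustrative sketch, not taken from the driver: the per-GPU query handlers
// above and below share the same shape: take the VA space lock for read, look
// the GPU up by UUID, fail with NV_ERR_INVALID_DEVICE if it is not registered,
// and drop the lock before returning. The helper name example_gpu_query() and
// its empty body are placeholders; the UUID parameter type is assumed to match
// the uvm_va_space_get_gpu_by_uuid() calls above.
static NV_STATUS __maybe_unused example_gpu_query(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
{
    uvm_gpu_t *gpu;
    NV_STATUS status = NV_OK;

    uvm_va_space_down_read(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
    if (!gpu)
        status = NV_ERR_INVALID_DEVICE;

    // ... read whatever per-GPU state the query needs while the lock is held ...

    uvm_va_space_up_read(va_space);

    return status;
}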
NV_STATUS uvm_test_get_gpu_time(UVM_TEST_GET_GPU_TIME_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_gpu_t *gpu = NULL;
    NV_STATUS status = NV_OK;

    uvm_va_space_down_read(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpu_uuid);

    if (gpu)
        params->timestamp_ns = gpu->parent->host_hal->get_time(gpu);
    else
        status = NV_ERR_INVALID_DEVICE;

    uvm_va_space_up_read(va_space);

    return status;
}