/*******************************************************************************
    Copyright (c) 2016-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_forward_decl.h"
#include "uvm_lock.h"
#include "uvm_mmu.h"
#include "uvm_api.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_push.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_tracker.h"
#include "uvm_hal.h"
#include "uvm_hal_types.h"
#include "uvm_map_external.h"
#include "uvm_pte_batch.h"
#include "uvm_tlb_batch.h"
#include "nv_uvm_interface.h"

#include "uvm_pushbuffer.h"

// Assume almost all of the push space can be used for PTEs, leaving 1K of
// margin.
#define MAX_COPY_SIZE_PER_PUSH ((size_t)(UVM_MAX_PUSH_SIZE - 1024))

typedef struct
{
    // The VA range the buffer is for
    uvm_va_range_t *va_range;

    // The GPU that's mapping the VA range
    uvm_gpu_t *gpu;

    // Mapping info used for querying PTEs from RM
    UvmGpuExternalMappingInfo mapping_info;

    // Size of the buffer
    size_t buffer_size;

    // Page size in bytes
    NvU32 page_size;

    // Size of a single PTE in bytes
    NvU32 pte_size;

    // Max PTE offset covered by the VA range.
    //
    // Notably the mapping might not start at offset 0, so the max PTE offset
    // can be larger than the number of PTEs covering the VA range.
    size_t max_pte_offset;

    // Number of PTEs currently in the buffer
    size_t num_ptes;

    // PTE offset at which the currently buffered PTEs start
    size_t pte_offset;
} uvm_pte_buffer_t;

// Max PTE buffer size is the size of the buffer used for querying PTEs from
// RM. It has to be big enough to amortize the cost of calling into RM, but
// small enough to fit in CPU caches, as it's written and read multiple times
// on the CPU before it ends up in the pushbuffer.
// 96K seems to be a sweet spot at least on a Xeon W5580 system, but this could
// use some benchmarking on more systems.
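//
// As a rough illustration (assuming 8-byte PTEs, which is typical for the GPU
// page tree): a full 96K buffer holds 12288 PTEs, so a single RM query can
// cover 48M of VA with 4K pages or 768M of VA with 64K pages.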
#define MAX_PTE_BUFFER_SIZE ((size_t)96 * 1024)

static NV_STATUS uvm_pte_buffer_init(uvm_va_range_t *va_range,
                                     uvm_gpu_t *gpu,
                                     const uvm_map_rm_params_t *map_rm_params,
                                     NvU64 length,
                                     NvU32 page_size,
                                     uvm_pte_buffer_t *pte_buffer)
{
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, gpu);
    uvm_page_tree_t *tree = &gpu_va_space->page_tables;
    size_t num_all_ptes;

    memset(pte_buffer, 0, sizeof(*pte_buffer));

    pte_buffer->va_range = va_range;
    pte_buffer->gpu = gpu;
    pte_buffer->mapping_info.cachingType = map_rm_params->caching_type;
    pte_buffer->mapping_info.mappingType = map_rm_params->mapping_type;
    pte_buffer->mapping_info.formatType = map_rm_params->format_type;
    pte_buffer->mapping_info.elementBits = map_rm_params->element_bits;
    pte_buffer->mapping_info.compressionType = map_rm_params->compression_type;
    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL)
        pte_buffer->mapping_info.mappingPageSize = page_size;

    pte_buffer->page_size = page_size;
    pte_buffer->pte_size = uvm_mmu_pte_size(tree, page_size);
    num_all_ptes = uvm_div_pow2_64(length, page_size);
    pte_buffer->max_pte_offset = uvm_div_pow2_64(map_rm_params->map_offset, page_size) + num_all_ptes;
    pte_buffer->buffer_size = min(MAX_PTE_BUFFER_SIZE, num_all_ptes * pte_buffer->pte_size);

    pte_buffer->mapping_info.pteBuffer = uvm_kvmalloc(pte_buffer->buffer_size);
    if (!pte_buffer->mapping_info.pteBuffer)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

static void uvm_pte_buffer_deinit(uvm_pte_buffer_t *pte_buffer)
{
    uvm_kvfree(pte_buffer->mapping_info.pteBuffer);
}

// Get the PTEs for mapping the [map_offset, map_offset + map_size) range of
// the allocation.
static NV_STATUS uvm_pte_buffer_get(uvm_pte_buffer_t *pte_buffer,
                                    NvHandle mem_handle,
                                    NvU64 map_offset,
                                    NvU64 map_size,
                                    NvU64 **ptes_out)
{
    NV_STATUS status;
    size_t pte_offset;
    size_t num_ptes;
    size_t ptes_left;
    uvm_va_range_t *va_range = pte_buffer->va_range;
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, pte_buffer->gpu);

    UVM_ASSERT(IS_ALIGNED(map_offset, pte_buffer->page_size));
    UVM_ASSERT(IS_ALIGNED(map_size, pte_buffer->page_size));

    pte_offset = uvm_div_pow2_64(map_offset, pte_buffer->page_size);
    num_ptes = uvm_div_pow2_64(map_size, pte_buffer->page_size);

    UVM_ASSERT(num_ptes <= pte_buffer->buffer_size / pte_buffer->pte_size);

    // If the requested range is already fully cached, just calculate its
    // offset within the buffer and return.
    if (pte_buffer->pte_offset <= pte_offset &&
        pte_buffer->pte_offset + pte_buffer->num_ptes >= pte_offset + num_ptes) {
        pte_offset -= pte_buffer->pte_offset;
        *ptes_out = (NvU64 *)((char *)pte_buffer->mapping_info.pteBuffer + pte_offset * pte_buffer->pte_size);
        return NV_OK;
    }

    // Otherwise get the maximum possible number of PTEs from RM, starting at
    // the requested offset.
    pte_buffer->pte_offset = pte_offset;
    ptes_left = pte_buffer->max_pte_offset - pte_offset;
    pte_buffer->num_ptes = min(pte_buffer->buffer_size / pte_buffer->pte_size, ptes_left);

    UVM_ASSERT_MSG(pte_buffer->num_ptes >= num_ptes,
                   "buffer num ptes %zu < num ptes %zu\n",
                   pte_buffer->num_ptes,
                   num_ptes);

    // TODO: Bug 1735291: RM can determine the buffer size from the map_size
    //       parameter.
    pte_buffer->mapping_info.pteBufferSize = pte_buffer->num_ptes * pte_buffer->pte_size;

    if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
        status = uvm_rm_locked_call(nvUvmInterfaceGetChannelResourcePtes(gpu_va_space->duped_gpu_va_space,
                                                                         va_range->channel.rm_descriptor,
                                                                         map_offset,
                                                                         pte_buffer->num_ptes * pte_buffer->page_size,
                                                                         &pte_buffer->mapping_info));
    }
    else {
        status = uvm_rm_locked_call(nvUvmInterfaceGetExternalAllocPtes(gpu_va_space->duped_gpu_va_space,
                                                                       mem_handle,
                                                                       map_offset,
                                                                       pte_buffer->num_ptes * pte_buffer->page_size,
                                                                       &pte_buffer->mapping_info));
    }

    if (status != NV_OK) {
        if (status != NV_ERR_NOT_READY) {
            UVM_ERR_PRINT("Failed to get %s mappings for VA range [0x%llx, 0x%llx], offset 0x%llx, size 0x%llx: %s\n",
                          va_range->type == UVM_VA_RANGE_TYPE_CHANNEL ? "channel" : "external",
                          va_range->node.start,
                          va_range->node.end,
                          map_offset,
                          map_size,
                          nvstatusToString(status));
        }
        return status;
    }

    *ptes_out = pte_buffer->mapping_info.pteBuffer;

    return NV_OK;
}

// Copies the input PTE buffer to the given physical address, with an optional
// TLB invalidate. The copy acquires the input tracker and then updates it.
static NV_STATUS copy_ptes(uvm_page_tree_t *tree,
                           NvU64 page_size,
                           uvm_gpu_phys_address_t pte_addr,
                           NvU64 *ptes,
                           NvU32 num_ptes,
                           bool last_mapping,
                           uvm_range_tree_node_t *range_node,
                           uvm_tracker_t *tracker)
{
    uvm_push_t push;
    NV_STATUS status;
    NvU32 pte_size = uvm_mmu_pte_size(tree, page_size);

    UVM_ASSERT(((NvU64)pte_size) * num_ptes == pte_size * num_ptes);
    UVM_ASSERT(pte_size * num_ptes <= MAX_COPY_SIZE_PER_PUSH);

    status = uvm_push_begin_acquire(tree->gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    tracker,
                                    &push,
                                    "Writing %u bytes of PTEs to {%s, 0x%llx}",
                                    pte_size * num_ptes,
                                    uvm_aperture_string(pte_addr.aperture),
                                    pte_addr.address);
    if (status != NV_OK)
        return status;

    uvm_pte_batch_single_write_ptes(&push, pte_addr, ptes, pte_size, num_ptes);

    if (last_mapping) {
        // Do a TLB invalidate if this is the last mapping in the VA range.
        // Membar: this is a permissions upgrade, so no post-invalidate membar
        // is needed.
        uvm_tlb_batch_single_invalidate(tree,
                                        &push,
                                        range_node->start,
                                        uvm_range_tree_node_size(range_node),
                                        page_size,
                                        UVM_MEMBAR_NONE);
    }
    else {
        // For pushes prior to the last one, the PTE batch write has already
        // pushed a membar that's enough to order the PTE writes with the TLB
        // invalidate in the last push, and that's all that's needed.
        // If a failure happens before the push for the last mapping, it is
        // still ok as what will follow is more CE writes to unmap the PTEs,
        // and those will get ordered by the membar from the PTE batch.
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
    }

    uvm_push_end(&push);

    // The push acquired the tracker, so it's ok to just overwrite it with the
    // entry tracking the push.
    uvm_tracker_overwrite_with_push(tracker, &push);

    return NV_OK;
}

// Map all of pt_range, which is contained within the va_range and begins at
// virtual address map_start. The PTE values are queried from RM and the pushed
// writes are added to the input tracker.
//
// If the mapped range ends on range_node->end, a TLB invalidate for upgrade is
// also issued.
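//
// Note that the PTE writes are issued in chunks bounded by the PTE buffer size
// and by MAX_COPY_SIZE_PER_PUSH, so mapping a single pt_range may take several
// pushes; only the final push (the one ending at range_node->end) carries the
// TLB invalidate.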
static NV_STATUS map_rm_pt_range(uvm_page_tree_t *tree,
                                 uvm_page_table_range_t *pt_range,
                                 uvm_pte_buffer_t *pte_buffer,
                                 uvm_range_tree_node_t *range_node,
                                 NvHandle mem_handle,
                                 NvU64 map_start,
                                 NvU64 map_offset,
                                 uvm_tracker_t *tracker)
{
    uvm_gpu_phys_address_t pte_addr;
    NvU64 page_size = pt_range->page_size;
    NvU32 pte_size = uvm_mmu_pte_size(tree, page_size);
    NvU64 addr, end;
    size_t max_ptes, ptes_left, num_ptes;
    NvU64 map_size;
    bool last_mapping;
    NV_STATUS status = NV_OK;

    end = map_start + uvm_page_table_range_size(pt_range) - 1;

    UVM_ASSERT(map_start >= range_node->start);
    UVM_ASSERT(end <= range_node->end);
    UVM_ASSERT(page_size & tree->hal->page_sizes());
    UVM_ASSERT(IS_ALIGNED(map_start, page_size));
    UVM_ASSERT(IS_ALIGNED(map_offset, page_size));

    pte_addr = uvm_page_table_range_entry_address(tree, pt_range, 0);
    max_ptes = min((size_t)(uvm_mmu_pde_coverage(tree, page_size) / page_size), MAX_COPY_SIZE_PER_PUSH / pte_size);
    max_ptes = min(max_ptes, pte_buffer->buffer_size / pte_size);

    addr = map_start;
    ptes_left = (size_t)uvm_div_pow2_64(uvm_page_table_range_size(pt_range), page_size);
    while (addr < end) {
        NvU64 *pte_bits;

        num_ptes = min(max_ptes, ptes_left);
        map_size = num_ptes * page_size;
        UVM_ASSERT(addr + map_size <= end + 1);

        status = uvm_pte_buffer_get(pte_buffer, mem_handle, map_offset, map_size, &pte_bits);
        if (status != NV_OK)
            return status;

        last_mapping = (addr + map_size - 1 == range_node->end);

        // These copies are technically independent, except for the last one,
        // which issues the TLB invalidate and thus must wait for all the
        // others. However, since each copy will saturate the bus anyway, we
        // force them to serialize to avoid bus contention.
        status = copy_ptes(tree,
                           page_size,
                           pte_addr,
                           pte_bits,
                           num_ptes,
                           last_mapping,
                           range_node,
                           tracker);
        if (status != NV_OK)
            return status;

        ptes_left -= num_ptes;
        pte_addr.address += num_ptes * pte_size;
        addr += map_size;
        map_offset += map_size;
    }

    return NV_OK;
}

// Determine the appropriate membar for downgrades on a VA range with type
// UVM_VA_RANGE_TYPE_EXTERNAL or UVM_VA_RANGE_TYPE_CHANNEL.
static uvm_membar_t va_range_downgrade_membar(uvm_va_range_t *va_range, uvm_ext_gpu_map_t *ext_gpu_map)
{
    if (va_range->type == UVM_VA_RANGE_TYPE_CHANNEL) {
        return uvm_hal_downgrade_membar_type(va_range->channel.gpu_va_space->gpu,
                                             va_range->channel.aperture == UVM_APERTURE_VID);
    }

    // If there is no mem_handle, this is a sparse mapping.
    // UVM_MEMBAR_GPU is sufficient because the debug pages remain allocated
    // until the GPU is torn down. GPU tear down implies that our context has
    // been switched out, which in turn implies a sysmembar.
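    // For mappings that do have a mem_handle, the choice below depends on
    // whether the mapping targets the mapping GPU's own vidmem (see
    // uvm_hal_downgrade_membar_type).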
    if (!ext_gpu_map->mem_handle)
        return UVM_MEMBAR_GPU;

    return uvm_hal_downgrade_membar_type(ext_gpu_map->gpu,
                                         !ext_gpu_map->is_sysmem && ext_gpu_map->gpu == ext_gpu_map->owning_gpu);
}

NV_STATUS uvm_va_range_map_rm_allocation(uvm_va_range_t *va_range,
                                         uvm_gpu_t *mapping_gpu,
                                         const UvmGpuMemoryInfo *mem_info,
                                         const uvm_map_rm_params_t *map_rm_params,
                                         uvm_ext_gpu_map_t *ext_gpu_map,
                                         uvm_tracker_t *out_tracker)
{
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_range->va_space, mapping_gpu);
    uvm_page_tree_t *page_tree;
    uvm_pte_buffer_t pte_buffer;
    uvm_page_table_range_vec_t *pt_range_vec;
    uvm_page_table_range_t *pt_range;
    uvm_range_tree_node_t *node;
    NvU64 addr, size;
    NvU64 map_offset = map_rm_params->map_offset;
    size_t i;
    NV_STATUS status;
    uvm_tracker_t *tracker;

    // Track local pushes in a separate tracker, instead of adding them
    // directly to the output tracker, to avoid false dependencies
    // (serialization) on unrelated work. The local tracker is added to the
    // output tracker before the function returns.
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();

    // The local tracker is used when this function is called to map
    // allocations other than external allocations. External allocations use
    // their own tracker.
    if (ext_gpu_map)
        tracker = &ext_gpu_map->tracker;
    else
        tracker = &local_tracker;

    UVM_ASSERT(gpu_va_space);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL || va_range->type == UVM_VA_RANGE_TYPE_CHANNEL);
    UVM_ASSERT(IS_ALIGNED(mem_info->size, mem_info->pageSize));
    UVM_ASSERT(out_tracker);

    page_tree = &gpu_va_space->page_tables;

    UVM_ASSERT(uvm_mmu_page_size_supported(page_tree, mem_info->pageSize));

    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
        // We should never be called with ext_gpu_map == NULL and
        // UVM_VA_RANGE_TYPE_EXTERNAL.
        UVM_ASSERT(ext_gpu_map != NULL);
        node = &ext_gpu_map->node;
        pt_range_vec = &ext_gpu_map->pt_range_vec;
    }
    else {
        node = &va_range->node;
        pt_range_vec = &va_range->channel.pt_range_vec;
    }

    if (map_offset + uvm_range_tree_node_size(node) > mem_info->size)
        return NV_ERR_INVALID_OFFSET;

    UVM_ASSERT(IS_ALIGNED(node->start, mem_info->pageSize) &&
               IS_ALIGNED(node->end + 1, mem_info->pageSize) &&
               IS_ALIGNED(map_offset, mem_info->pageSize));

    status = uvm_pte_buffer_init(va_range,
                                 mapping_gpu,
                                 map_rm_params,
                                 uvm_range_tree_node_size(node),
                                 mem_info->pageSize,
                                 &pte_buffer);
    if (status != NV_OK)
        return status;

    // Allocate all page tables for this VA range.
    //
    // TODO: Bug 1766649: Benchmark to see if we get any performance
    //       improvement from parallelizing page range allocation with writing
    //       PTEs for earlier ranges.
    status = uvm_page_table_range_vec_init(page_tree,
                                           node->start,
                                           uvm_range_tree_node_size(node),
                                           mem_info->pageSize,
                                           UVM_PMM_ALLOC_FLAGS_EVICT,
                                           pt_range_vec);
    if (status != NV_OK)
        goto out;

    addr = node->start;
    for (i = 0; i < pt_range_vec->range_count; i++) {
        pt_range = &pt_range_vec->ranges[i];

        // External allocations track pushes in their own trackers. User
        // channel mappings don't have their own trackers, so for those the
        // local tracker is used.
        status = map_rm_pt_range(page_tree,
                                 pt_range,
                                 &pte_buffer,
                                 node,
                                 ext_gpu_map ? ext_gpu_map->mem_handle->rm_handle : 0,
                                 addr,
                                 map_offset,
                                 tracker);
        if (status != NV_OK)
            goto out;

        size = uvm_page_table_range_size(pt_range);
        addr += size;
        map_offset += size;
    }

    status = uvm_tracker_add_tracker(out_tracker, tracker);

out:
    if (status != NV_OK) {
        // We could have any number of mappings in flight to these page tables,
        // so wait for everything before we clear and free them.
        if (uvm_tracker_wait(tracker) != NV_OK) {
            // System-fatal error. Just leak.
            return status;
        }

        if (pt_range_vec->ranges) {
            uvm_page_table_range_vec_clear_ptes(pt_range_vec, va_range_downgrade_membar(va_range, ext_gpu_map));
            uvm_page_table_range_vec_deinit(pt_range_vec);
        }
    }

    uvm_pte_buffer_deinit(&pte_buffer);
    uvm_tracker_deinit(&local_tracker);
    return status;
}

static bool uvm_api_mapping_type_invalid(UvmGpuMappingType map_type)
{
    BUILD_BUG_ON((int)UvmGpuMappingTypeDefault != (int)UvmRmGpuMappingTypeDefault);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadWriteAtomic != (int)UvmRmGpuMappingTypeReadWriteAtomic);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadWrite != (int)UvmRmGpuMappingTypeReadWrite);
    BUILD_BUG_ON((int)UvmGpuMappingTypeReadOnly != (int)UvmRmGpuMappingTypeReadOnly);
    BUILD_BUG_ON((int)UvmGpuMappingTypeCount != (int)UvmRmGpuMappingTypeCount);

    switch (map_type) {
        case UvmGpuMappingTypeDefault:
        case UvmGpuMappingTypeReadWriteAtomic:
        case UvmGpuMappingTypeReadWrite:
        case UvmGpuMappingTypeReadOnly:
            return false;
        default:
            return true;
    }
}

static bool uvm_api_caching_type_invalid(UvmGpuCachingType cache_type)
{
    BUILD_BUG_ON((int)UvmGpuCachingTypeDefault != (int)UvmRmGpuCachingTypeDefault);
    BUILD_BUG_ON((int)UvmGpuCachingTypeForceUncached != (int)UvmRmGpuCachingTypeForceUncached);
    BUILD_BUG_ON((int)UvmGpuCachingTypeForceCached != (int)UvmRmGpuCachingTypeForceCached);
    BUILD_BUG_ON((int)UvmGpuCachingTypeCount != (int)UvmRmGpuCachingTypeCount);

    switch (cache_type) {
        case UvmGpuCachingTypeDefault:
        case UvmGpuCachingTypeForceUncached:
        case UvmGpuCachingTypeForceCached:
            return false;
        default:
            return true;
    }
}

static bool uvm_api_kind_type_invalid(UvmGpuFormatType format_type,
                                      UvmGpuFormatElementBits element_bits,
                                      UvmGpuCompressionType compression_type)
{
    BUILD_BUG_ON((int)UvmGpuFormatTypeDefault != (int)UvmRmGpuFormatTypeDefault);
    BUILD_BUG_ON((int)UvmGpuFormatTypeBlockLinear != (int)UvmRmGpuFormatTypeBlockLinear);
    BUILD_BUG_ON((int)UvmGpuFormatTypeCount != (int)UvmRmGpuFormatTypeCount);

    BUILD_BUG_ON((int)UvmGpuFormatElementBitsDefault != (int)UvmRmGpuFormatElementBitsDefault);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits8 != (int)UvmRmGpuFormatElementBits8);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits16 != (int)UvmRmGpuFormatElementBits16);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits32 != (int)UvmRmGpuFormatElementBits32);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits64 != (int)UvmRmGpuFormatElementBits64);
    BUILD_BUG_ON((int)UvmGpuFormatElementBits128 != (int)UvmRmGpuFormatElementBits128);
    BUILD_BUG_ON((int)UvmGpuFormatElementBitsCount != (int)UvmRmGpuFormatElementBitsCount);

    BUILD_BUG_ON((int)UvmGpuCompressionTypeDefault != (int)UvmRmGpuCompressionTypeDefault);
    BUILD_BUG_ON((int)UvmGpuCompressionTypeEnabledNoPlc != (int)UvmRmGpuCompressionTypeEnabledNoPlc);
    BUILD_BUG_ON((int)UvmGpuCompressionTypeCount != (int)UvmRmGpuCompressionTypeCount);

    if (compression_type >= UvmGpuCompressionTypeCount)
        return true;

    switch (format_type) {
        case UvmGpuFormatTypeDefault:
        case UvmGpuFormatTypeBlockLinear:
            break;
        default:
            return true;
    }

    switch (element_bits) {
        case UvmGpuFormatElementBitsDefault:
        case UvmGpuFormatElementBits8:
        case UvmGpuFormatElementBits16:
        // CUDA does not support 24-bit width
        case UvmGpuFormatElementBits32:
        case UvmGpuFormatElementBits64:
        case UvmGpuFormatElementBits128:
            break;
        default:
            return true;
    }

    if (((format_type != UvmGpuFormatTypeDefault) && (element_bits == UvmGpuFormatElementBitsDefault)) ||
        ((element_bits != UvmGpuFormatElementBitsDefault) && (format_type == UvmGpuFormatTypeDefault)))
        return true;

    return false;
}

static void uvm_release_rm_handle(struct nv_kref *ref)
{
    uvm_ext_gpu_mem_handle *mem_handle = container_of(ref, uvm_ext_gpu_mem_handle, ref_count);

    if (mem_handle->rm_handle) {
        NV_STATUS status;

        status = uvm_rm_locked_call(nvUvmInterfaceFreeDupedHandle(uvm_gpu_device_handle(mem_handle->gpu),
                                                                  mem_handle->rm_handle));
        UVM_ASSERT(status == NV_OK);
    }

    uvm_kvfree(mem_handle);
}

static NV_STATUS uvm_create_external_range(uvm_va_space_t *va_space, UVM_CREATE_EXTERNAL_RANGE_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;

    // Before we know the page size used by the allocation, we can only enforce
    // 4K alignment as that's the minimum page size used for GPU allocations.
    // Later uvm_map_external_allocation_on_gpu() will enforce alignment to the
    // page size used by the allocation.
    if (uvm_api_range_invalid_4k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    // The mm needs to be locked in order to remove stale HMM va_blocks.
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    // Create the new external VA range.
    // uvm_va_range_create_external handles any collisions when it attempts to
    // insert the new range into the va_space range tree.
    status = uvm_va_range_create_external(va_space, mm, params->base, params->length, &va_range);
    if (status != NV_OK) {
        UVM_DBG_PRINT_RL("Failed to create external VA range [0x%llx, 0x%llx)\n",
                         params->base,
                         params->base + params->length);
    }

    uvm_va_space_up_write(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_api_create_external_range(UVM_CREATE_EXTERNAL_RANGE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_create_external_range(va_space, params);
}

static NV_STATUS set_ext_gpu_map_location(uvm_ext_gpu_map_t *ext_gpu_map,
                                          uvm_va_space_t *va_space,
                                          uvm_gpu_t *mapping_gpu,
                                          const UvmGpuMemoryInfo *mem_info)
{
    uvm_gpu_t *owning_gpu;

    if (!mem_info->deviceDescendant && !mem_info->sysmem) {
        ext_gpu_map->owning_gpu = NULL;
        ext_gpu_map->is_sysmem = false;
        return NV_OK;
    }

    // This is a local or peer allocation, so the owning GPU must have been
    // registered.
    owning_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &mem_info->uuid);
    if (!owning_gpu)
        return NV_ERR_INVALID_DEVICE;

    // Even if the allocation is in sysmem, it still matters which GPU owns it,
    // because our dup is not enough to keep the owning GPU around, and that
    // exposes a bug in RM where the memory can outlast the GPU and then cause
    // crashes when it's eventually freed.
    // TODO: Bug 1811006: Bug tracking the RM issue, its fix might change the
    //       semantics of sysmem allocations.
    if (mem_info->sysmem) {
        ext_gpu_map->owning_gpu = owning_gpu;
        ext_gpu_map->is_sysmem = true;
        return NV_OK;
    }

    if (owning_gpu != mapping_gpu) {
        // TODO: Bug 1757136: In SLI, the returned UUID may be different but a
        //       local mapping must be used. We need to query SLI groups to
        //       know that.
        if (!uvm_va_space_peer_enabled(va_space, mapping_gpu, owning_gpu))
            return NV_ERR_INVALID_DEVICE;
    }

    ext_gpu_map->owning_gpu = owning_gpu;
    ext_gpu_map->is_sysmem = false;
    return NV_OK;
}

static uvm_ext_gpu_map_t *uvm_va_range_ext_gpu_map(uvm_va_range_t *va_range, uvm_gpu_t *mapping_gpu, NvU64 addr)
{
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_range_tree_node_t *node;
    uvm_ext_gpu_range_tree_t *range_tree;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);

    if (uvm_processor_mask_test(&va_range->external.mapped_gpus, mapping_gpu->id)) {
        UVM_ASSERT(!uvm_range_tree_empty(&range_tree->tree));
        node = uvm_range_tree_find(&range_tree->tree, addr);
        if (node) {
            ext_gpu_map = uvm_ext_gpu_map_container(node);
            UVM_ASSERT(ext_gpu_map->gpu == mapping_gpu);
        }
    }
    else {
        UVM_ASSERT(uvm_range_tree_empty(&range_tree->tree));
    }

    return ext_gpu_map;
}

static NV_STATUS uvm_ext_gpu_map_split(uvm_range_tree_t *tree,
                                       uvm_ext_gpu_map_t *existing_map,
                                       NvU64 new_end,
                                       uvm_ext_gpu_map_t **new_map)
{
    uvm_ext_gpu_map_t *new;
    NV_STATUS status;
    NvU64 new_start = new_end + 1;

    if (!IS_ALIGNED(new_start, existing_map->pt_range_vec.page_size))
        return NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(new_start >= existing_map->node.start && new_start < existing_map->node.end);

    new = uvm_kvmalloc_zero(sizeof(*new));
    if (!new)
        return NV_ERR_NO_MEMORY;

    RB_CLEAR_NODE(&new->node.rb_node);
    new->mem_handle = existing_map->mem_handle;
    new->gpu = existing_map->gpu;
    new->owning_gpu = existing_map->owning_gpu;
    new->is_sysmem = existing_map->is_sysmem;

    // Initialize the new ext_gpu_map's tracker as a copy of the existing_map's
    // tracker. This way, any operation on either of the two ext_gpu_maps will
    // be able to wait for any uncompleted work prior to the split.
    status = uvm_tracker_init_from(&new->tracker, &existing_map->tracker);
    if (status != NV_OK) {
        uvm_kvfree(new);
        return status;
    }

    status = uvm_page_table_range_vec_split_upper(&existing_map->pt_range_vec, new_start - 1, &new->pt_range_vec);
    if (status != NV_OK) {
        uvm_tracker_deinit(&new->tracker);
        uvm_kvfree(new);
        return status;
    }

    new->node.start = new_start;

    // Sparse mappings don't have actual allocations.
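    // For mappings that do, the mem_handle is now shared between the existing
    // and the new map, so take an extra reference on it for the new map.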
    if (new->mem_handle)
        nv_kref_get(&new->mem_handle->ref_count);

    uvm_range_tree_split(tree, &existing_map->node, &new->node);

    if (new_map)
        *new_map = new;

    return NV_OK;
}

static NV_STATUS uvm_unmap_external_in_range(uvm_va_range_t *va_range,
                                             uvm_gpu_t *gpu,
                                             NvU64 start,
                                             NvU64 end,
                                             struct list_head *deferred_list)
{
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
    uvm_ext_gpu_map_t *ext_map, *ext_map_next = NULL;
    NV_STATUS status = NV_OK;

    uvm_assert_mutex_locked(&range_tree->lock);

    // If a previously existing sub-range is found (ext_map != NULL), the new
    // sub-range can overlap the existing one in one of the following ways:
    //
    //   1. Complete overlap (exact start and end boundary matches are special
    //      cases of this):
    //             [---- existing ----]
    //           [-------- new --------]
    //
    //   2. Partial overlap at the start (an end boundary match is a special
    //      case of this):
    //             [---- existing ----]
    //        [---- new ----]
    //
    //   3. Partial overlap at the end (a start boundary match is a special
    //      case of this):
    //        [---- existing ----]
    //                  [---- new ----]
    //
    //   4. Completely contained (start of new != start of existing and end of
    //      new != end of existing, otherwise see 1):
    //        [------- existing -------]
    //             [-- new --]
    //
    // The algorithm below is:
    //   1. If the start of the new mapping is greater than the start of the
    //      existing mapping, split the existing mapping at start. The newly
    //      created uvm_ext_gpu_map_t will be inserted into the tree. Note that
    //      the newly created uvm_ext_gpu_map_t is the one that we want to
    //      visit next. When the loop visits it and its boundaries are
    //      completely overlapped by the new mapping, the algorithm will
    //      destroy it.
    //   2. If the end of the new mapping is less than the end of the existing
    //      mapping, split the existing mapping at end. The newly created
    //      uvm_ext_gpu_map_t will be inserted into the tree. The overlapping
    //      portion of the existing mapping will be destroyed.
    //   3. If the existing mapping is completely overlapped by the new
    //      mapping, the existing mapping is destroyed.
    //
    // The loop cannot use any of the existing iterators because:
    //   1. It needs to be able to destroy ext_gpu_map structures. This means
    //      it can't use non-safe iterators.
    //   2. It needs to visit uvm_ext_gpu_map_t structures newly created as a
    //      result of splits. This means it can't use safe iterators, as they
    //      will skip the newly created uvm_ext_gpu_map_t.
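    //
    // As a concrete example of case 4: unmapping [start, end] out of an
    // existing mapping [A, B] with A < start and end < B first splits [A, B]
    // into [A, start - 1] and [start, B]; the next iteration then splits
    // [start, B] into [start, end] and [end + 1, B] and destroys [start, end],
    // leaving [A, start - 1] and [end + 1, B] mapped.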
    ext_map = uvm_ext_gpu_map_iter_first(va_range, gpu, start, end);
    while (ext_map) {
        if (start > ext_map->node.start) {
            status = uvm_ext_gpu_map_split(&range_tree->tree, ext_map, start - 1, &ext_map_next);
            if (status != NV_OK)
                break;
        }
        else {
            if (end < ext_map->node.end) {
                status = uvm_ext_gpu_map_split(&range_tree->tree, ext_map, end, NULL);
                if (status != NV_OK)
                    break;
                ext_map_next = NULL;
            }
            else {
                ext_map_next = uvm_ext_gpu_map_iter_next(va_range, ext_map, end);
            }

            uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_list);
        }

        ext_map = ext_map_next;
    }

    return status;
}

static NV_STATUS uvm_map_external_allocation_on_gpu(uvm_va_range_t *va_range,
                                                    uvm_gpu_t *mapping_gpu,
                                                    const uvm_rm_user_object_t *user_rm_mem,
                                                    const uvm_map_rm_params_t *map_rm_params,
                                                    NvU64 base,
                                                    NvU64 length,
                                                    uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
    UvmGpuMemoryInfo mem_info;
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
    NvU32 mapping_page_size;
    NvU64 alignments;
    NvU32 smallest_alignment;
    NV_STATUS status;

    uvm_assert_rwsem_locked_read(&va_space->lock);

    if ((map_rm_params->compression_type == UvmGpuCompressionTypeEnabledNoPlc) && !mapping_gpu->parent->plc_supported)
        return NV_ERR_INVALID_DEVICE;

    // Check if the GPU can access the VA
    if (!uvm_gpu_can_address(mapping_gpu, base, length))
        return NV_ERR_OUT_OF_RANGE;

    uvm_mutex_lock(&range_tree->lock);

    status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, NULL);
    if (status != NV_OK)
        goto error;

    ext_gpu_map = uvm_kvmalloc_zero(sizeof(*ext_gpu_map));
    if (!ext_gpu_map) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    // Insert the ext_gpu_map into the VA range immediately since some of the
    // calls below require it to be there.
    ext_gpu_map->node.start = base;
    ext_gpu_map->node.end = base + length - 1;
    RB_CLEAR_NODE(&ext_gpu_map->node.rb_node);
    uvm_tracker_init(&ext_gpu_map->tracker);

    ext_gpu_map->mem_handle = uvm_kvmalloc_zero(sizeof(*ext_gpu_map->mem_handle));
    if (!ext_gpu_map->mem_handle) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    // Because any overlapping mappings were already unmapped above, adding the
    // new mapping to the tree cannot fail.
    status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
    UVM_ASSERT(status == NV_OK);

    uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
    ext_gpu_map->gpu = mapping_gpu;
    ext_gpu_map->mem_handle->gpu = mapping_gpu;
    nv_kref_init(&ext_gpu_map->mem_handle->ref_count);

    // Error paths after this point may call uvm_va_range_ext_gpu_map, so do a
    // sanity check now to make sure it doesn't trigger any asserts.
    UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);

    // Dup the memory. This verifies the input handles, takes a ref count on
    // the physical allocation so it can't go away under us, and returns us the
    // allocation info.
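    // Note that the mem_info.pageSize returned by RM is only a starting point:
    // it is recomputed below to the largest page size compatible with the
    // alignment of the VA, length, and map offset before the mapping is
    // created.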
    status = uvm_rm_locked_call(nvUvmInterfaceDupMemory(uvm_gpu_device_handle(mapping_gpu),
                                                        user_rm_mem->user_client,
                                                        user_rm_mem->user_object,
                                                        &ext_gpu_map->mem_handle->rm_handle,
                                                        &mem_info));
    if (status != NV_OK) {
        UVM_DBG_PRINT("Failed to dup memory handle {0x%x, 0x%x}: %s, GPU: %s\n",
                      user_rm_mem->user_client,
                      user_rm_mem->user_object,
                      nvstatusToString(status),
                      uvm_gpu_name(mapping_gpu));
        goto error;
    }

    status = set_ext_gpu_map_location(ext_gpu_map, va_space, mapping_gpu, &mem_info);
    if (status != NV_OK)
        goto error;

    // Determine the proper mapping page size. This will be the largest
    // supported page size less than or equal to the smallest of the base VA
    // address, length, offset, and allocation page size alignments.
    alignments = mem_info.pageSize | base | length | map_rm_params->map_offset;
    smallest_alignment = alignments & ~(alignments - 1);

    // Check that the alignment bits did not get truncated.
    UVM_ASSERT(smallest_alignment);

    mapping_page_size = uvm_mmu_biggest_page_size_up_to(&gpu_va_space->page_tables, smallest_alignment);
    if (!mapping_page_size) {
        status = NV_ERR_INVALID_ADDRESS;
        goto error;
    }

    mem_info.pageSize = mapping_page_size;

    status = uvm_va_range_map_rm_allocation(va_range, mapping_gpu, &mem_info, map_rm_params, ext_gpu_map, out_tracker);
    if (status != NV_OK)
        goto error;

    uvm_mutex_unlock(&range_tree->lock);
    return NV_OK;

error:
    uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
    uvm_mutex_unlock(&range_tree->lock);
    return status;
}

// Actual implementation of UvmMapExternalAllocation
static NV_STATUS uvm_map_external_allocation(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_ALLOCATION_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *mapping_gpu;
    uvm_processor_mask_t mapped_gpus;
    NV_STATUS status = NV_OK;
    size_t i;
    uvm_map_rm_params_t map_rm_params;
    uvm_rm_user_object_t user_rm_mem =
    {
        .rm_control_fd = params->rmCtrlFd,
        .user_client   = params->hClient,
        .user_object   = params->hMemory
    };
    uvm_tracker_t tracker = UVM_TRACKER_INIT();

    if (uvm_api_range_invalid_4k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    if (params->gpuAttributesCount == 0 || params->gpuAttributesCount > UVM_MAX_GPUS)
        return NV_ERR_INVALID_ARGUMENT;

    uvm_va_space_down_read_rm(va_space);
    va_range = uvm_va_range_find(va_space, params->base);

    if (!va_range ||
        va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
        va_range->node.end < params->base + params->length - 1) {
        uvm_va_space_up_read_rm(va_space);
        return NV_ERR_INVALID_ADDRESS;
    }

    uvm_processor_mask_zero(&mapped_gpus);
    for (i = 0; i < params->gpuAttributesCount; i++) {
        if (uvm_api_mapping_type_invalid(params->perGpuAttributes[i].gpuMappingType) ||
            uvm_api_caching_type_invalid(params->perGpuAttributes[i].gpuCachingType) ||
            uvm_api_kind_type_invalid(params->perGpuAttributes[i].gpuFormatType,
                                      params->perGpuAttributes[i].gpuElementBits,
                                      params->perGpuAttributes[i].gpuCompressionType)) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto error;
        }

        mapping_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->perGpuAttributes[i].gpuUuid);
        if (!mapping_gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto error;
        }

        // Use a tracker to get as much parallelization as possible among GPUs,
        // so one GPU can have its PTE writes in flight while we're working on
        // the next one.
        map_rm_params.map_offset = params->offset;
        map_rm_params.mapping_type = params->perGpuAttributes[i].gpuMappingType;
        map_rm_params.caching_type = params->perGpuAttributes[i].gpuCachingType;
        map_rm_params.format_type = params->perGpuAttributes[i].gpuFormatType;
        map_rm_params.element_bits = params->perGpuAttributes[i].gpuElementBits;
        map_rm_params.compression_type = params->perGpuAttributes[i].gpuCompressionType;
        status = uvm_map_external_allocation_on_gpu(va_range,
                                                    mapping_gpu,
                                                    &user_rm_mem,
                                                    &map_rm_params,
                                                    params->base,
                                                    params->length,
                                                    &tracker);
        if (status != NV_OK)
            goto error;

        uvm_processor_mask_set(&mapped_gpus, mapping_gpu->id);
    }

    // Wait for outstanding page table operations to finish across all GPUs.
    // We just need to hold the VA space lock to prevent the GPUs on which
    // we're waiting from getting unregistered underneath us.
    status = uvm_tracker_wait_deinit(&tracker);

    uvm_va_space_up_read_rm(va_space);
    return status;

error:
    // We still have to wait for page table writes to finish, since the
    // teardown could free them.
    (void)uvm_tracker_wait_deinit(&tracker);

    // Tear down only those mappings we created during this call
    for_each_va_space_gpu_in_mask(mapping_gpu, va_space, &mapped_gpus) {
        uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
        uvm_ext_gpu_map_t *ext_map, *ext_map_next;

        uvm_mutex_lock(&range_tree->lock);
        uvm_ext_gpu_map_for_each_in_safe(ext_map,
                                         ext_map_next,
                                         va_range,
                                         mapping_gpu,
                                         params->base,
                                         params->base + params->length - 1)
            uvm_ext_gpu_map_destroy(va_range, ext_map, NULL);
        uvm_mutex_unlock(&range_tree->lock);
    }

    uvm_va_space_up_read_rm(va_space);

    return status;
}

NV_STATUS uvm_api_map_external_allocation(UVM_MAP_EXTERNAL_ALLOCATION_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_map_external_allocation(va_space, params);
}

static NvU64 external_sparse_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
{
    return range_vec->tree->hal->make_sparse_pte();
}

static NV_STATUS uvm_map_external_sparse_on_gpu(uvm_va_range_t *va_range,
                                                uvm_gpu_t *mapping_gpu,
                                                NvU64 base,
                                                NvU64 length,
                                                struct list_head *deferred_free_list)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_ext_gpu_map_t *ext_gpu_map = NULL;
    uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
    uvm_gpu_va_space_t *gpu_va_space = uvm_gpu_va_space_get(va_space, mapping_gpu);
    uvm_page_tree_t *page_tree;
    NV_STATUS status;

    uvm_assert_rwsem_locked(&va_space->lock);

    if (!uvm_gpu_can_address(mapping_gpu, base, length))
        return NV_ERR_OUT_OF_RANGE;

    UVM_ASSERT(gpu_va_space);

    page_tree = &gpu_va_space->page_tables;

    uvm_mutex_lock(&range_tree->lock);

    status = uvm_unmap_external_in_range(va_range, mapping_gpu, base, base + length - 1, deferred_free_list);
    if (status != NV_OK)
        goto error;

    ext_gpu_map = uvm_kvmalloc_zero(sizeof(*ext_gpu_map));
    if (!ext_gpu_map) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    ext_gpu_map->node.start = base;
    ext_gpu_map->node.end = base + length - 1;
    RB_CLEAR_NODE(&ext_gpu_map->node.rb_node);
    uvm_tracker_init(&ext_gpu_map->tracker);

    // Because any overlapping mappings were already unmapped above, adding the
    // new mapping to the tree cannot fail.
    status = uvm_range_tree_add(&range_tree->tree, &ext_gpu_map->node);
    UVM_ASSERT(status == NV_OK);

    uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, mapping_gpu->id);
    ext_gpu_map->gpu = mapping_gpu;

    UVM_ASSERT(uvm_va_range_ext_gpu_map(va_range, mapping_gpu, base) == ext_gpu_map);

    status = uvm_page_table_range_vec_init(page_tree,
                                           ext_gpu_map->node.start,
                                           uvm_range_tree_node_size(&ext_gpu_map->node),
                                           UVM_PAGE_SIZE_64K,
                                           UVM_PMM_ALLOC_FLAGS_EVICT,
                                           &ext_gpu_map->pt_range_vec);
    if (status != NV_OK)
        goto error;

    status = uvm_page_table_range_vec_write_ptes(&ext_gpu_map->pt_range_vec,
                                                 UVM_MEMBAR_NONE,
                                                 external_sparse_pte_maker,
                                                 NULL);
    if (status != NV_OK)
        goto error;

    uvm_mutex_unlock(&range_tree->lock);
    return NV_OK;

error:
    uvm_ext_gpu_map_destroy(va_range, ext_gpu_map, NULL);
    uvm_mutex_unlock(&range_tree->lock);
    return status;
}

static NV_STATUS uvm_map_external_sparse(uvm_va_space_t *va_space, UVM_MAP_EXTERNAL_SPARSE_PARAMS *params)
{
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *mapping_gpu = NULL;
    NV_STATUS status = NV_OK;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_64k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->base);
    if (!va_range ||
        va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL ||
        va_range->node.end < params->base + params->length - 1) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    mapping_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->gpuUuid);
    if (!mapping_gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    // Sparse mappings are unsupported on GPUs prior to Pascal.
    if (!mapping_gpu->parent->sparse_mappings_supported) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    status = uvm_map_external_sparse_on_gpu(va_range, mapping_gpu, params->base, params->length, &deferred_free_list);

    if (!list_empty(&deferred_free_list))
        uvm_gpu_retain(mapping_gpu);

out:
    uvm_va_space_up_read(va_space);

    if (!list_empty(&deferred_free_list)) {
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_gpu_release(mapping_gpu);
    }

    return status;
}

NV_STATUS uvm_api_map_external_sparse(UVM_MAP_EXTERNAL_SPARSE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_map_external_sparse(va_space, params);
}

// Version of free which returns the owning GPU, but doesn't release it
static uvm_gpu_t *uvm_ext_gpu_map_free_internal(uvm_ext_gpu_map_t *ext_gpu_map)
{
    uvm_gpu_t *owning_gpu;

    if (!ext_gpu_map)
        return NULL;

    UVM_ASSERT(!ext_gpu_map->pt_range_vec.ranges);

    if (ext_gpu_map->mem_handle)
        nv_kref_put(&ext_gpu_map->mem_handle->ref_count, uvm_release_rm_handle);

    owning_gpu = ext_gpu_map->owning_gpu;
    uvm_kvfree(ext_gpu_map);

    return owning_gpu;
}

void uvm_ext_gpu_map_free(uvm_ext_gpu_map_t *ext_gpu_map)
{
    uvm_gpu_t *owning_gpu = uvm_ext_gpu_map_free_internal(ext_gpu_map);

    if (owning_gpu)
        uvm_gpu_release(owning_gpu);
}

void uvm_ext_gpu_map_destroy(uvm_va_range_t *va_range,
                             uvm_ext_gpu_map_t *ext_gpu_map,
                             struct list_head *deferred_free_list)
{
    uvm_membar_t membar;
    uvm_ext_gpu_range_tree_t *range_tree;
    uvm_gpu_t *mapped_gpu;

    if (!ext_gpu_map)
        return;

    (void)uvm_tracker_wait_deinit(&ext_gpu_map->tracker);

    // The external map is inserted into the tree before the rest of the
    // mapping steps, so if it has not been inserted yet there is nothing to
    // clean up. Just free the memory.
    if (RB_EMPTY_NODE(&ext_gpu_map->node.rb_node)) {
        uvm_kvfree(ext_gpu_map->mem_handle);
        uvm_kvfree(ext_gpu_map);
        return;
    }

    mapped_gpu = ext_gpu_map->gpu;

    range_tree = uvm_ext_gpu_range_tree(va_range, mapped_gpu);

    uvm_assert_mutex_locked(&range_tree->lock);
    UVM_ASSERT(uvm_gpu_va_space_get(va_range->va_space, mapped_gpu));

    uvm_range_tree_remove(&range_tree->tree, &ext_gpu_map->node);

    // Unmap the PTEs
    if (ext_gpu_map->pt_range_vec.ranges) {
        membar = va_range_downgrade_membar(va_range, ext_gpu_map);
        uvm_page_table_range_vec_clear_ptes(&ext_gpu_map->pt_range_vec, membar);
        uvm_page_table_range_vec_deinit(&ext_gpu_map->pt_range_vec);
    }

    if (deferred_free_list && ext_gpu_map->mem_handle) {
        // If this is a GPU allocation, we have to prevent that GPU from going
        // away until we've freed the handle.
        if (ext_gpu_map->owning_gpu)
            uvm_gpu_retain(ext_gpu_map->owning_gpu);

        uvm_deferred_free_object_add(deferred_free_list,
                                     &ext_gpu_map->deferred_free,
                                     UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION);
    }
    else {
        uvm_ext_gpu_map_free_internal(ext_gpu_map);
    }

    // Only when the sub-range tree is empty can the GPU be removed from the
    // mapped_gpus bitmap.
    if (uvm_range_tree_empty(&range_tree->tree))
        uvm_processor_mask_clear_atomic(&va_range->external.mapped_gpus, mapped_gpu->id);
}

static NV_STATUS uvm_unmap_external(uvm_va_space_t *va_space,
                                    NvU64 base,
                                    NvU64 length,
                                    const NvProcessorUuid *gpu_uuid)
{
    uvm_va_range_t *va_range;
    uvm_gpu_t *gpu = NULL;
    NV_STATUS status = NV_OK;
    uvm_ext_gpu_range_tree_t *range_tree;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_4k(base, length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, base);
    if (!va_range || va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL || base + length - 1 > va_range->node.end) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto out;
    }

    range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
    uvm_mutex_lock(&range_tree->lock);
    status = uvm_unmap_external_in_range(va_range, gpu, base, base + length - 1, &deferred_free_list);
    uvm_mutex_unlock(&range_tree->lock);

    // If the deferred_free_list is not empty, retain the GPU which maps the
    // allocation because it's the parent of the dup_handle. The owning GPU
    // (if any) is retained internally by the deferred free layer.
    if (!list_empty(&deferred_free_list))
        uvm_gpu_retain(gpu);

out:
    uvm_va_space_up_read(va_space);

    if (!list_empty(&deferred_free_list)) {
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_gpu_release(gpu);
    }

    return status;
}

NV_STATUS uvm_api_unmap_external(UVM_UNMAP_EXTERNAL_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_unmap_external(va_space, params->base, params->length, &params->gpuUuid);
}

// This destroys VA ranges created by UvmMapExternalAllocation,
// UvmMapDynamicParallelismRegion, and UvmAllocSemaphorePool *only*. VA ranges
// created by UvmMemMap and UvmAlloc go through mmap/munmap.
static NV_STATUS uvm_free(uvm_va_space_t *va_space, NvU64 base, NvU64 length)
{
    uvm_va_range_t *va_range;
    NV_STATUS status = NV_OK;
    uvm_global_processor_mask_t retained_mask;
    LIST_HEAD(deferred_free_list);

    if (uvm_api_range_invalid_4k(base, length))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_write(va_space);

    // Non-managed ranges are defined to not require splitting, so a partial
    // free attempt is an error.
    //
    // TODO: Bug 1763676: The length parameter may be needed for MPS. If not,
    //       it should be removed from the ioctl.
    va_range = uvm_va_range_find(va_space, base);
    if (!va_range ||
        (va_range->type != UVM_VA_RANGE_TYPE_EXTERNAL &&
         va_range->type != UVM_VA_RANGE_TYPE_SKED_REFLECTED &&
         va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) ||
        va_range->node.start != base ||
        va_range->node.end != base + length - 1) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    if ((va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) &&
        uvm_mem_mapped_on_cpu_user(va_range->semaphore_pool.mem)) {
        // Semaphore pools must first be unmapped from the CPU with munmap to
        // invalidate the vma.
        status = NV_ERR_INVALID_ARGUMENT;
        goto out;
    }

    if (va_range->type == UVM_VA_RANGE_TYPE_EXTERNAL) {
        // External ranges may have deferred free work, so the GPUs may have to
        // be retained. Construct the mask of all the GPUs that need to be
        // retained.
        uvm_va_space_global_gpus_in_mask(va_space, &retained_mask, &va_range->external.mapped_gpus);
    }

    uvm_va_range_destroy(va_range, &deferred_free_list);

    // If there is deferred work, retain the required GPUs.
    if (!list_empty(&deferred_free_list))
        uvm_global_mask_retain(&retained_mask);

out:
    uvm_va_space_up_write(va_space);

    if (!list_empty(&deferred_free_list)) {
        UVM_ASSERT(status == NV_OK);
        uvm_deferred_free_object_list(&deferred_free_list);
        uvm_global_mask_release(&retained_mask);
    }

    return status;
}

NV_STATUS uvm_api_free(UVM_FREE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    return uvm_free(va_space, params->base, params->length);
}