/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_types.h"
#include "uvm_api.h"
#include "uvm_global.h"
#include "uvm_hal.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_kvmalloc.h"
#include "uvm_map_external.h"
#include "uvm_perf_thrashing.h"
#include "nv_uvm_interface.h"

static struct kmem_cache *g_uvm_va_range_cache __read_mostly;
static struct kmem_cache *g_uvm_vma_wrapper_cache __read_mostly;

NV_STATUS uvm_va_range_init(void)
{
    g_uvm_va_range_cache = NV_KMEM_CACHE_CREATE("uvm_va_range_t", uvm_va_range_t);
    if (!g_uvm_va_range_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_vma_wrapper_cache = NV_KMEM_CACHE_CREATE("uvm_vma_wrapper_t", uvm_vma_wrapper_t);
    if (!g_uvm_vma_wrapper_cache)
        return NV_ERR_NO_MEMORY;

    return uvm_va_block_init();
}

void uvm_va_range_exit(void)
{
    uvm_va_block_exit();
    kmem_cache_destroy_safe(&g_uvm_va_range_cache);
    kmem_cache_destroy_safe(&g_uvm_vma_wrapper_cache);
}

static NvU64 block_calc_start(uvm_va_range_t *va_range, size_t index)
{
    NvU64 range_start = UVM_VA_BLOCK_ALIGN_DOWN(va_range->node.start);
    NvU64 block_start = range_start + index * UVM_VA_BLOCK_SIZE;
    NvU64 start = max(va_range->node.start, block_start);
    UVM_ASSERT(start < va_range->node.end);
    return start;
}

static NvU64 block_calc_end(uvm_va_range_t *va_range, size_t index)
{
    NvU64 start = block_calc_start(va_range, index);
    NvU64 block_end = UVM_VA_BLOCK_ALIGN_UP(start + 1) - 1; // Inclusive end
    NvU64 end = min(va_range->node.end, block_end);
    UVM_ASSERT(end > va_range->node.start);
    return end;
}

// Called before the range's bounds have been adjusted. This may not actually
// shrink the blocks array. For example, if the shrink attempt fails then
// va_range's old array is left intact. This may waste memory, but it means
// this function cannot fail.
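// Note: new_num_blocks is the number of leading entries of the blocks array to
// keep; callers such as uvm_va_range_split_blocks are expected to have already
// transferred or cleared any entries past that point.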
static void blocks_array_shrink(uvm_va_range_t *va_range, size_t new_num_blocks)
{
    size_t new_size = new_num_blocks * sizeof(va_range->blocks[0]);
    atomic_long_t *new_blocks;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    UVM_ASSERT(va_range->blocks);
    UVM_ASSERT(uvm_kvsize(va_range->blocks) >= uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
    UVM_ASSERT(new_num_blocks);
    UVM_ASSERT(new_num_blocks <= uvm_va_range_num_blocks(va_range));

    // TODO: Bug 1766579: This could be optimized by only shrinking the array
    //       when the new size is half of the old size or some similar
    //       threshold. Need to profile this on real apps to see if that's
    //       worth doing.

    new_blocks = uvm_kvrealloc(va_range->blocks, new_size);
    if (!new_blocks) {
        // If we failed to allocate a smaller array, just leave the old one as-is
        UVM_DBG_PRINT("Failed to shrink range [0x%llx, 0x%llx] from %zu blocks to %zu blocks\n",
                      va_range->node.start,
                      va_range->node.end,
                      uvm_kvsize(va_range->blocks) / sizeof(va_range->blocks[0]),
                      new_num_blocks);
        return;
    }

    va_range->blocks = new_blocks;
}

static uvm_va_range_t *uvm_va_range_alloc(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
{
    uvm_va_range_t *va_range = nv_kmem_cache_zalloc(g_uvm_va_range_cache, NV_UVM_GFP_FLAGS);
    if (!va_range)
        return NULL;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    va_range->va_space = va_space;
    va_range->node.start = start;
    va_range->node.end = end;

    // The range is inserted into the VA space tree only at the end of creation,
    // so clear the node so the destroy path knows whether to remove it.
    RB_CLEAR_NODE(&va_range->node.rb_node);

    return va_range;
}

static NV_STATUS uvm_va_range_alloc_reclaim(uvm_va_space_t *va_space,
                                            struct mm_struct *mm,
                                            uvm_va_range_type_t type,
                                            NvU64 start,
                                            NvU64 end,
                                            uvm_va_range_t **out_va_range)
{
    uvm_va_range_t *va_range;
    NV_STATUS status;

    // Check for no overlap with HMM blocks.
    status = uvm_hmm_va_block_reclaim(va_space, mm, start, end);
    if (status != NV_OK)
        return status;

    va_range = uvm_va_range_alloc(va_space, start, end);
    if (!va_range)
        return NV_ERR_NO_MEMORY;

    va_range->type = type;

    *out_va_range = va_range;
    return NV_OK;
}

static uvm_va_range_t *uvm_va_range_alloc_managed(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
{
    uvm_va_range_t *va_range = NULL;

    va_range = uvm_va_range_alloc(va_space, start, end);
    if (!va_range)
        goto error;

    va_range->type = UVM_VA_RANGE_TYPE_MANAGED;
    va_range->managed.policy = uvm_va_policy_default;

    va_range->blocks = uvm_kvmalloc_zero(uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
    if (!va_range->blocks) {
        UVM_DBG_PRINT("Failed to allocate %zu blocks\n", uvm_va_range_num_blocks(va_range));
        goto error;
    }

    return va_range;

error:
    uvm_va_range_destroy(va_range, NULL);
    return NULL;
}

NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
                                   struct mm_struct *mm,
                                   uvm_vma_wrapper_t *vma_wrapper,
                                   uvm_va_range_t **out_va_range)
{
    NV_STATUS status;
    struct vm_area_struct *vma = vma_wrapper->vma;
    uvm_va_range_t *va_range = NULL;

    // Check for no overlap with HMM blocks.
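    // (uvm_hmm_va_block_reclaim is expected to remove any HMM va_blocks in
    // this interval so the new managed range cannot overlap them; vma->vm_end
    // is exclusive, hence the -1 below.)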
    status = uvm_hmm_va_block_reclaim(va_space, mm, vma->vm_start, vma->vm_end - 1);
    if (status != NV_OK)
        return status;

    // vma->vm_end is exclusive but va_range end is inclusive
    va_range = uvm_va_range_alloc_managed(va_space, vma->vm_start, vma->vm_end - 1);
    if (!va_range) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    va_range->managed.vma_wrapper = vma_wrapper;

    status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
    if (status != NV_OK)
        goto error;

    if (out_va_range)
        *out_va_range = va_range;

    return NV_OK;

error:
    uvm_va_range_destroy(va_range, NULL);
    return status;
}

NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space,
                                       struct mm_struct *mm,
                                       NvU64 start,
                                       NvU64 length,
                                       uvm_va_range_t **out_va_range)
{
    NV_STATUS status;
    uvm_va_range_t *va_range = NULL;
    NvU32 i;

    status = uvm_va_range_alloc_reclaim(va_space,
                                        mm,
                                        UVM_VA_RANGE_TYPE_EXTERNAL,
                                        start,
                                        start + length - 1,
                                        &va_range);
    if (status != NV_OK)
        return status;

    for (i = 0; i < ARRAY_SIZE(va_range->external.gpu_ranges); i++) {
        uvm_mutex_init(&va_range->external.gpu_ranges[i].lock, UVM_LOCK_ORDER_EXT_RANGE_TREE);
        uvm_range_tree_init(&va_range->external.gpu_ranges[i].tree);
    }

    status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
    if (status != NV_OK)
        goto error;

    if (out_va_range)
        *out_va_range = va_range;

    return NV_OK;

error:
    uvm_va_range_destroy(va_range, NULL);
    return status;
}

NV_STATUS uvm_va_range_create_channel(uvm_va_space_t *va_space,
                                      struct mm_struct *mm,
                                      NvU64 start,
                                      NvU64 end,
                                      uvm_va_range_t **out_va_range)
{
    NV_STATUS status;
    uvm_va_range_t *va_range = NULL;

    status = uvm_va_range_alloc_reclaim(va_space,
                                        mm,
                                        UVM_VA_RANGE_TYPE_CHANNEL,
                                        start,
                                        end,
                                        &va_range);
    if (status != NV_OK)
        return status;

    INIT_LIST_HEAD(&va_range->channel.list_node);

    status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
    if (status != NV_OK)
        goto error;

    if (out_va_range)
        *out_va_range = va_range;

    return NV_OK;

error:
    uvm_va_range_destroy(va_range, NULL);
    return status;
}

NV_STATUS uvm_va_range_create_sked_reflected(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             uvm_va_range_t **out_va_range)
{
    NV_STATUS status;
    uvm_va_range_t *va_range = NULL;

    status = uvm_va_range_alloc_reclaim(va_space,
                                        mm,
                                        UVM_VA_RANGE_TYPE_SKED_REFLECTED,
                                        start,
                                        start + length - 1,
                                        &va_range);
    if (status != NV_OK)
        return status;

    status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
    if (status != NV_OK)
        goto error;

    if (out_va_range)
        *out_va_range = va_range;

    return NV_OK;

error:
    uvm_va_range_destroy(va_range, NULL);
    return status;
}

NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             const UvmGpuMappingAttributes *per_gpu_attrs,
                                             NvU32 per_gpu_attrs_count,
                                             uvm_va_range_t **out_va_range)
{
    static const uvm_mem_gpu_mapping_attrs_t default_attrs = {
        .protection = UVM_PROT_READ_WRITE_ATOMIC,
        .is_cacheable = false
    };

    NV_STATUS status;
    uvm_va_range_t *va_range = NULL;
    uvm_mem_alloc_params_t mem_alloc_params = { 0 };
    NvU32 i;
    uvm_gpu_id_t gpu_id;

    status = uvm_va_range_alloc_reclaim(va_space,
                                        mm,
                                        UVM_VA_RANGE_TYPE_SEMAPHORE_POOL,
                                        start,
                                        start + length - 1,
                                        &va_range);
    if (status != NV_OK)
        return status;

    uvm_tracker_init(&va_range->semaphore_pool.tracker);
    uvm_mutex_init(&va_range->semaphore_pool.tracker_lock, UVM_LOCK_ORDER_SEMA_POOL_TRACKER);

    status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
    if (status != NV_OK)
        goto error;

    // The semaphore pool memory is located in sysmem, and must be zeroed upon
    // allocation because it may be mapped on the user VA space.
    mem_alloc_params.page_size = UVM_PAGE_SIZE_DEFAULT;
    mem_alloc_params.size = length;
    mem_alloc_params.zero = true;
    mem_alloc_params.mm = mm;

    va_range->semaphore_pool.default_gpu_attrs = default_attrs;
    va_range->semaphore_pool.owner = NULL;

    for_each_gpu_id(gpu_id)
        va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu_id)] = default_attrs;

    for (i = 0; i < per_gpu_attrs_count; i++) {
        uvm_gpu_t *gpu;
        uvm_mem_gpu_mapping_attrs_t attrs = default_attrs;

        status = uvm_mem_translate_gpu_attributes(&per_gpu_attrs[i], va_space, &gpu, &attrs);
        if (status != NV_OK)
            goto error;

        if (i == 0 && g_uvm_global.conf_computing_enabled)
            mem_alloc_params.dma_owner = gpu;

        if (attrs.is_cacheable) {
            // At most 1 GPU can have this memory cached, in which case it is
            // the 'owner' GPU.
            if (va_range->semaphore_pool.owner != NULL) {
                UVM_DBG_PRINT("Caching of semaphore pool requested on >1 GPU.");
                status = NV_ERR_INVALID_ARGUMENT;
                goto error;
            }

            va_range->semaphore_pool.owner = gpu;
        }

        va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)] = attrs;
    }

    status = uvm_mem_alloc(&mem_alloc_params, &va_range->semaphore_pool.mem);
    if (status != NV_OK)
        goto error;

    status = uvm_mem_map_cpu_kernel(va_range->semaphore_pool.mem);
    if (status != NV_OK)
        goto error;

    if (out_va_range)
        *out_va_range = va_range;

    return NV_OK;

error:
    uvm_va_range_destroy(va_range, NULL);
    return status;
}

static void uvm_va_range_destroy_managed(uvm_va_range_t *va_range)
{
    uvm_va_block_t *block;
    uvm_va_block_t *block_tmp;
    uvm_perf_event_data_t event_data;
    NV_STATUS status;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);

    if (va_range->blocks) {
        // Unmap and drop our ref count on each block
        for_each_va_block_in_va_range_safe(va_range, block, block_tmp)
            uvm_va_block_kill(block);

        uvm_kvfree(va_range->blocks);
    }

    event_data.range_destroy.range = va_range;
    uvm_perf_event_notify(&va_range->va_space->perf_events, UVM_PERF_EVENT_RANGE_DESTROY, &event_data);

    status = uvm_range_group_assign_range(va_range->va_space, NULL, va_range->node.start, va_range->node.end);
    UVM_ASSERT(status == NV_OK);
}

static void uvm_va_range_destroy_external(uvm_va_range_t *va_range, struct list_head *deferred_free_list)
{
    uvm_gpu_t *gpu;

    if (uvm_processor_mask_empty(&va_range->external.mapped_gpus))
        return;

    UVM_ASSERT(deferred_free_list);

    for_each_va_space_gpu_in_mask(gpu, va_range->va_space, &va_range->external.mapped_gpus) {
        uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
        uvm_ext_gpu_map_t *ext_map, *ext_map_next;

        uvm_mutex_lock(&range_tree->lock);
        uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, gpu)
            uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
        uvm_mutex_unlock(&range_tree->lock);
    }

    UVM_ASSERT(uvm_processor_mask_empty(&va_range->external.mapped_gpus));
}

static void uvm_va_range_destroy_channel(uvm_va_range_t *va_range)
{
    uvm_gpu_va_space_t *gpu_va_space = va_range->channel.gpu_va_space;
    uvm_membar_t membar;

    UVM_ASSERT(va_range->channel.ref_count == 0);

    // Unmap the buffer
    if (gpu_va_space && va_range->channel.pt_range_vec.ranges) {
        membar = uvm_hal_downgrade_membar_type(gpu_va_space->gpu, va_range->channel.aperture == UVM_APERTURE_VID);
        uvm_page_table_range_vec_clear_ptes(&va_range->channel.pt_range_vec, membar);
        uvm_page_table_range_vec_deinit(&va_range->channel.pt_range_vec);
    }

    list_del(&va_range->channel.list_node);

    // Channel unregister handles releasing this descriptor back to RM
    va_range->channel.rm_descriptor = 0;
}

static void uvm_va_range_destroy_sked_reflected(uvm_va_range_t *va_range)
{
    uvm_gpu_va_space_t *gpu_va_space = va_range->sked_reflected.gpu_va_space;

    if (!gpu_va_space || !va_range->sked_reflected.pt_range_vec.ranges)
        return;

    // The SKED reflected mapping has no physical backing and hence no physical
    // accesses can be pending to it and no membar is needed.
    uvm_page_table_range_vec_clear_ptes(&va_range->sked_reflected.pt_range_vec, UVM_MEMBAR_NONE);
    uvm_page_table_range_vec_deinit(&va_range->sked_reflected.pt_range_vec);

    va_range->sked_reflected.gpu_va_space = NULL;
}

static void uvm_va_range_destroy_semaphore_pool(uvm_va_range_t *va_range)
{
    NV_STATUS status = uvm_tracker_wait_deinit(&va_range->semaphore_pool.tracker);
    if (status != NV_OK) {
        UVM_ASSERT_MSG(status == uvm_global_get_status(),
                       "uvm_tracker_wait() returned %d (%s) in uvm_va_range_destroy_semaphore_pool()\n",
                       status,
                       nvstatusToString(status));
    }
    uvm_mem_free(va_range->semaphore_pool.mem);
    va_range->semaphore_pool.mem = NULL;
}

void uvm_va_range_destroy(uvm_va_range_t *va_range, struct list_head *deferred_free_list)
{
    if (!va_range)
        return;

    if (!RB_EMPTY_NODE(&va_range->node.rb_node))
        uvm_range_tree_remove(&va_range->va_space->va_range_tree, &va_range->node);

    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_INVALID:
            // Skip partially-created ranges with unset types
            break;
        case UVM_VA_RANGE_TYPE_MANAGED:
            uvm_va_range_destroy_managed(va_range);
            break;
        case UVM_VA_RANGE_TYPE_EXTERNAL:
            uvm_va_range_destroy_external(va_range, deferred_free_list);
            break;
        case UVM_VA_RANGE_TYPE_CHANNEL:
            uvm_va_range_destroy_channel(va_range);
            break;
        case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
            uvm_va_range_destroy_sked_reflected(va_range);
            break;
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            uvm_va_range_destroy_semaphore_pool(va_range);
            break;
        default:
            UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
                           va_range->node.start, va_range->node.end, va_range->type);
    }

    kmem_cache_free(g_uvm_va_range_cache, va_range);
}

void uvm_va_range_zombify(uvm_va_range_t *va_range)
{
    if (!va_range)
        return;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    UVM_ASSERT(va_range->managed.vma_wrapper);

    // Destroy will be done by uvm_destroy_vma_managed
    va_range->managed.vma_wrapper = NULL;
}

NV_STATUS uvm_api_clean_up_zombie_resources(UVM_CLEAN_UP_ZOMBIE_RESOURCES_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range, *va_range_next;

    uvm_va_space_down_write(va_space);

    uvm_for_each_va_range_safe(va_range, va_range_next, va_space) {
        if (uvm_va_range_is_managed_zombie(va_range))
            uvm_va_range_destroy(va_range, NULL);
    }

    uvm_va_space_up_write(va_space);

    return NV_OK;
}

NV_STATUS uvm_api_validate_va_range(UVM_VALIDATE_VA_RANGE_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_ERR_INVALID_ADDRESS;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;

    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->base);
    if (va_range && va_range->node.start == params->base && va_range->node.end + 1 == params->base + params->length)
        status = NV_OK;

    uvm_va_space_up_read(va_space);

    return status;
}

static NV_STATUS va_range_add_gpu_va_space_managed(uvm_va_range_t *va_range,
                                                   uvm_gpu_va_space_t *gpu_va_space,
                                                   struct mm_struct *mm)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    NV_STATUS status = NV_OK;
    const bool should_add_remote_mappings =
        uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu->id) ||
        uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id);

    // By this time, the gpu is already in the registration mask.
    const bool should_disable_read_duplication =
        uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED &&
        (uvm_va_space_can_read_duplicate(va_space, NULL) != uvm_va_space_can_read_duplicate(va_space, gpu));

    // Combine conditions to perform a single VA block traversal
    if (gpu_va_space->ats.enabled || should_add_remote_mappings || should_disable_read_duplication) {
        uvm_va_block_t *va_block;
        uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);

        // TODO: Bug 2090378. Consolidate all per-VA block operations within
        //       uvm_va_block_add_gpu_va_space so we only need to take the VA
        //       block once.
        for_each_va_block_in_va_range(va_range, va_block) {
            if (gpu_va_space->ats.enabled) {
                // Notify that a new GPU VA space has been created. This is only
                // currently used for PDE1 pre-population on ATS systems.
                status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, uvm_va_block_add_gpu_va_space(va_block, gpu_va_space));
                if (status != NV_OK)
                    break;
            }

            if (should_add_remote_mappings) {
                // Now that we have a GPU VA space, map any VA ranges for which
                // this GPU is a UVM-Lite GPU or has accessed_by set.
                status = uvm_va_block_set_accessed_by(va_block, va_block_context, gpu->id);
                if (status != NV_OK)
                    break;
            }

            if (should_disable_read_duplication) {
                status = uvm_va_block_unset_read_duplication(va_block, va_block_context);
                if (status != NV_OK)
                    break;
            }
        }
    }

    return status;
}

static NV_STATUS va_range_add_gpu_va_space_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
{
    uvm_mem_gpu_mapping_attrs_t *attrs;

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
    UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(va_range->semaphore_pool.mem, gpu));

    attrs = &va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)];

    return uvm_mem_map_gpu_user(va_range->semaphore_pool.mem,
                                gpu,
                                va_range->va_space,
                                (void *)va_range->node.start,
                                attrs);
}

NV_STATUS uvm_va_range_add_gpu_va_space(uvm_va_range_t *va_range,
                                        uvm_gpu_va_space_t *gpu_va_space,
                                        struct mm_struct *mm)
{
    UVM_ASSERT(va_range->type < UVM_VA_RANGE_TYPE_MAX);

    if (va_range->inject_add_gpu_va_space_error) {
        va_range->inject_add_gpu_va_space_error = false;
        return NV_ERR_NO_MEMORY;
    }

    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            return va_range_add_gpu_va_space_managed(va_range, gpu_va_space, mm);
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            return va_range_add_gpu_va_space_semaphore_pool(va_range, gpu_va_space->gpu);
        default:
            return NV_OK;
    }
}

static void va_range_remove_gpu_va_space_managed(uvm_va_range_t *va_range,
                                                 uvm_gpu_va_space_t *gpu_va_space,
                                                 struct mm_struct *mm)
{
    uvm_va_block_t *va_block;
    uvm_va_space_t *va_space = va_range->va_space;
    bool should_enable_read_duplicate;
    uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);

    should_enable_read_duplicate =
        uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED &&
        uvm_va_space_can_read_duplicate(va_space, NULL) != uvm_va_space_can_read_duplicate(va_space, gpu_va_space->gpu);

    for_each_va_block_in_va_range(va_range, va_block) {
        uvm_mutex_lock(&va_block->lock);
        uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
        uvm_mutex_unlock(&va_block->lock);

        if (should_enable_read_duplicate)
            uvm_va_block_set_read_duplication(va_block, va_block_context);
    }
}

static void va_range_remove_gpu_va_space_external(uvm_va_range_t *va_range,
                                                  uvm_gpu_t *gpu,
                                                  struct list_head *deferred_free_list)
{
    uvm_ext_gpu_range_tree_t *range_tree;
    uvm_ext_gpu_map_t *ext_map, *ext_map_next;

    UVM_ASSERT(deferred_free_list);

    range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
    uvm_mutex_lock(&range_tree->lock);

    uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, gpu)
        uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);

    uvm_mutex_unlock(&range_tree->lock);
}

static void va_range_remove_gpu_va_space_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
{
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);

    if (g_uvm_global.conf_computing_enabled && (va_range->semaphore_pool.mem->dma_owner == gpu))
        uvm_va_range_destroy(va_range, NULL);
    else
        uvm_mem_unmap_gpu_user(va_range->semaphore_pool.mem, gpu);
}

void uvm_va_range_remove_gpu_va_space(uvm_va_range_t *va_range,
                                      uvm_gpu_va_space_t *gpu_va_space,
                                      struct mm_struct *mm,
                                      struct list_head *deferred_free_list)
{
    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            va_range_remove_gpu_va_space_managed(va_range, gpu_va_space, mm);
            break;
        case UVM_VA_RANGE_TYPE_EXTERNAL:
            va_range_remove_gpu_va_space_external(va_range, gpu_va_space->gpu, deferred_free_list);
            break;
        case UVM_VA_RANGE_TYPE_CHANNEL:
            // All channels under this GPU VA space should've been removed
            // before removing the GPU VA space.
            UVM_ASSERT(va_range->channel.gpu_va_space != gpu_va_space);
            break;
        case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
            if (va_range->sked_reflected.gpu_va_space == gpu_va_space)
                uvm_va_range_destroy_sked_reflected(va_range);
            break;
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            va_range_remove_gpu_va_space_semaphore_pool(va_range, gpu_va_space->gpu);
            break;
        default:
            UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
                           va_range->node.start, va_range->node.end, va_range->type);
    }
}

static NV_STATUS uvm_va_range_enable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    NV_STATUS status;
    uvm_va_block_t *va_block;
    bool gpu0_accessed_by = uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu0->id);
    bool gpu1_accessed_by = uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu1->id);
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, NULL);

    for_each_va_block_in_va_range(va_range, va_block) {
        // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic
        //       into uvm_va_block_enable_peer.
        uvm_mutex_lock(&va_block->lock);
        status = uvm_va_block_enable_peer(va_block, gpu0, gpu1);
        uvm_mutex_unlock(&va_block->lock);

        if (status != NV_OK)
            return status;

        // For UVM-Lite at most one GPU needs to map the peer GPU if it's the
        // preferred location, but it doesn't hurt to just try mapping both.
        if (gpu0_accessed_by) {
            status = uvm_va_block_set_accessed_by(va_block,
                                                  va_block_context,
                                                  gpu0->id);
            if (status != NV_OK)
                return status;
        }

        if (gpu1_accessed_by) {
            status = uvm_va_block_set_accessed_by(va_block,
                                                  va_block_context,
                                                  gpu1->id);
            if (status != NV_OK)
                return status;
        }
    }

    return NV_OK;
}

NV_STATUS uvm_va_range_enable_peer(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            return uvm_va_range_enable_peer_managed(va_range, gpu0, gpu1);
        case UVM_VA_RANGE_TYPE_EXTERNAL:
            // UVM_VA_RANGE_TYPE_EXTERNAL doesn't create new mappings when enabling peer access
            return NV_OK;
        case UVM_VA_RANGE_TYPE_CHANNEL:
            // UVM_VA_RANGE_TYPE_CHANNEL should never have peer mappings
            return NV_OK;
        case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
            // UVM_VA_RANGE_TYPE_SKED_REFLECTED should never have peer mappings
            return NV_OK;
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            // UVM_VA_RANGE_TYPE_SEMAPHORE_POOL should never have peer mappings
            return NV_OK;
        default:
            UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
                           va_range->node.start, va_range->node.end, va_range->type);
            return NV_ERR_NOT_SUPPORTED;
    }
}

static void uvm_va_range_disable_peer_external(uvm_va_range_t *va_range,
                                               uvm_gpu_t *mapping_gpu,
                                               uvm_gpu_t *owning_gpu,
                                               struct list_head *deferred_free_list)
{
    uvm_ext_gpu_range_tree_t *range_tree;
    uvm_ext_gpu_map_t *ext_map, *ext_map_next;

    range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
    uvm_mutex_lock(&range_tree->lock);
    uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, mapping_gpu) {
        if (ext_map->owning_gpu == owning_gpu && (!ext_map->is_sysmem || ext_map->is_egm)) {
            UVM_ASSERT(deferred_free_list);
            uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
        }
    }
    uvm_mutex_unlock(&range_tree->lock);
}

static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    uvm_va_block_t *va_block;
    uvm_gpu_t *uvm_lite_gpu_to_unmap = NULL;

    bool uvm_lite_mode = uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu0->id) &&
                         uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu1->id);

    if (uvm_lite_mode) {
        // In UVM-Lite mode, the UVM-Lite GPUs can only have mappings to the
        // preferred location. If peer mappings are being disabled to the
        // preferred location, then unmap the other GPU.
        // Nothing to do otherwise.
        if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
            uvm_lite_gpu_to_unmap = gpu1;
        else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
            uvm_lite_gpu_to_unmap = gpu0;
        else
            return;
    }

    for_each_va_block_in_va_range(va_range, va_block) {
        uvm_mutex_lock(&va_block->lock);
        if (uvm_lite_mode)
            uvm_va_block_unmap_preferred_location_uvm_lite(va_block, uvm_lite_gpu_to_unmap);
        else
            uvm_va_block_disable_peer(va_block, gpu0, gpu1);
        uvm_mutex_unlock(&va_block->lock);
    }

    if (uvm_lite_mode && !uvm_range_group_all_migratable(va_range->va_space, va_range->node.start, va_range->node.end)) {
        UVM_ASSERT(uvm_lite_gpu_to_unmap);

        // Migration is prevented, but we had to unmap a UVM-Lite GPU. Update
        // the accessed by and UVM-Lite GPUs masks as it cannot be considered a
        // UVM-Lite GPU any more.
        uvm_va_range_unset_accessed_by(va_range, uvm_lite_gpu_to_unmap->id, NULL);
    }
}

void uvm_va_range_disable_peer(uvm_va_range_t *va_range,
                               uvm_gpu_t *gpu0,
                               uvm_gpu_t *gpu1,
                               struct list_head *deferred_free_list)
{
    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            uvm_va_range_disable_peer_managed(va_range, gpu0, gpu1);
            break;
        case UVM_VA_RANGE_TYPE_EXTERNAL:
            // If GPU 0 has a mapping to GPU 1, remove GPU 0's mapping
            uvm_va_range_disable_peer_external(va_range, gpu0, gpu1, deferred_free_list);
            // If GPU 1 has a mapping to GPU 0, remove GPU 1's mapping
            uvm_va_range_disable_peer_external(va_range, gpu1, gpu0, deferred_free_list);
            break;
        case UVM_VA_RANGE_TYPE_CHANNEL:
            // UVM_VA_RANGE_TYPE_CHANNEL should never have peer mappings
            break;
        case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
            // UVM_VA_RANGE_TYPE_SKED_REFLECTED should never have peer mappings
            break;
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            // UVM_VA_RANGE_TYPE_SEMAPHORE_POOL should never have peer mappings
            break;
        default:
            UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
                           va_range->node.start, va_range->node.end, va_range->type);
    }
}

static NV_STATUS va_range_register_gpu_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
{
    // TODO: Bug 1812419: pass GPU mapping attributes to uvm_mem_map_gpu_kernel
    //       once that function accepts them.
    return uvm_mem_map_gpu_kernel(va_range->semaphore_pool.mem, gpu);
}

NV_STATUS uvm_va_range_register_gpu(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
{
    UVM_ASSERT(va_range->type < UVM_VA_RANGE_TYPE_MAX);
    uvm_assert_rwsem_locked_write(&va_range->va_space->lock);

    if (va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL)
        return va_range_register_gpu_semaphore_pool(va_range, gpu);

    return NV_OK;
}

static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu, struct mm_struct *mm)
{
    uvm_va_block_t *va_block;

    // Reset preferred location and accessed-by of VA ranges if needed.
    // Note: ignoring the return code of uvm_va_range_set_preferred_location
    // since it will only return an error when setting a preferred location,
    // not on a reset.
    if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
        (void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);

    uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);

    // Migrate and free any remaining resident allocations on this GPU
    for_each_va_block_in_va_range(va_range, va_block)
        uvm_va_block_unregister_gpu(va_block, gpu, mm);
}

// The GPU being unregistered can't have any remaining mappings, since those
// were removed when the corresponding GPU VA space was removed. However, other
// GPUs could still have mappings to memory resident on this GPU, so we have to
// unmap those.
static void va_range_unregister_gpu_external(uvm_va_range_t *va_range,
                                             uvm_gpu_t *gpu,
                                             struct list_head *deferred_free_list)
{
    uvm_ext_gpu_map_t *ext_map, *ext_map_next;
    uvm_gpu_t *other_gpu;

    for_each_va_space_gpu_in_mask(other_gpu, va_range->va_space, &va_range->external.mapped_gpus) {
        uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, other_gpu);
        UVM_ASSERT(other_gpu != gpu);

        uvm_mutex_lock(&range_tree->lock);
        uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, other_gpu) {
            if (ext_map->owning_gpu == gpu) {
                UVM_ASSERT(deferred_free_list);
                uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
            }
        }
        uvm_mutex_unlock(&range_tree->lock);
    }
}

static void va_range_unregister_gpu_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
{
    NV_STATUS status;

    // Ranges for this GPU should have been unmapped from the user VA space
    // during GPU VA space unregister, which should have already happened.
    UVM_ASSERT(!uvm_mem_mapped_on_gpu_user(va_range->semaphore_pool.mem, gpu));
    UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(va_range->semaphore_pool.mem, gpu));

    uvm_mutex_lock(&va_range->semaphore_pool.tracker_lock);
    status = uvm_tracker_wait(&va_range->semaphore_pool.tracker);
    uvm_mutex_unlock(&va_range->semaphore_pool.tracker_lock);
    if (status != NV_OK)
        UVM_ASSERT(status == uvm_global_get_status());

    uvm_mem_unmap_gpu_phys(va_range->semaphore_pool.mem, gpu);

    va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)] = va_range->semaphore_pool.default_gpu_attrs;
    if (va_range->semaphore_pool.owner == gpu)
        va_range->semaphore_pool.owner = NULL;
}

void uvm_va_range_unregister_gpu(uvm_va_range_t *va_range,
                                 uvm_gpu_t *gpu,
                                 struct mm_struct *mm,
                                 struct list_head *deferred_free_list)
{
    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            va_range_unregister_gpu_managed(va_range, gpu, mm);
            break;
        case UVM_VA_RANGE_TYPE_EXTERNAL:
            va_range_unregister_gpu_external(va_range, gpu, deferred_free_list);
            break;
        case UVM_VA_RANGE_TYPE_CHANNEL:
            // All ranges should have been destroyed by GPU VA space unregister,
            // which should have already happened.
            UVM_ASSERT(va_range->channel.gpu_va_space->gpu != gpu);
            break;
        case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
            // All ranges for this GPU should have been unmapped by GPU VA space
            // unregister (uvm_va_range_destroy_sked_reflected), which should
            // have already happened.
            if (va_range->sked_reflected.gpu_va_space != NULL)
                UVM_ASSERT(va_range->sked_reflected.gpu_va_space->gpu != gpu);
            break;
        case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
            va_range_unregister_gpu_semaphore_pool(va_range, gpu);
            break;
        default:
            UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
                           va_range->node.start, va_range->node.end, va_range->type);
    }
}

// Split existing's blocks into new. new's blocks array has already been
// allocated. This is called before existing's range node is split, so it
// overlaps new. new is always in the upper region of existing.
//
// The caller will do the range tree split.
//
// If this fails it leaves existing unchanged.
static NV_STATUS uvm_va_range_split_blocks(uvm_va_range_t *existing, uvm_va_range_t *new)
{
    uvm_va_block_t *old_block, *block = NULL;
    size_t existing_blocks, split_index, new_index = 0;
    NV_STATUS status;

    UVM_ASSERT(new->node.start > existing->node.start);
    UVM_ASSERT(new->node.end <= existing->node.end);

    split_index = uvm_va_range_block_index(existing, new->node.start);

    // Handle a block spanning the split point
    if (block_calc_start(existing, split_index) != new->node.start) {
        // If a populated block actually spans the split point, we have to split
        // the block. Otherwise just account for the extra entry in the arrays.
        old_block = uvm_va_range_block(existing, split_index);
        if (old_block) {
            UVM_ASSERT(old_block->start < new->node.start);
            status = uvm_va_block_split(old_block, new->node.start - 1, &block, new);
            if (status != NV_OK)
                return status;

            // No memory barrier is needed since we're holding the va_space lock
            // in write mode, so no other thread can access the blocks array.
            atomic_long_set(&new->blocks[0], (long)block);
        }

        new_index = 1;
    }

    // uvm_va_block_split gets first crack at injecting an error. If it did so,
    // we wouldn't be here. However, not all va_range splits will call
    // uvm_va_block_split so we need an extra check here. We can't push this
    // injection later since all paths past this point assume success, so they
    // modify the state of 'existing' range.
    //
    // Even if there was no block split above, there is no guarantee that one
    // of our blocks doesn't have the 'inject_split_error' flag set. We clear
    // that here to prevent multiple errors caused by one
    // 'uvm_test_va_range_inject_split_error' call.
    if (existing->inject_split_error) {
        UVM_ASSERT(!block);
        existing->inject_split_error = false;

        for_each_va_block_in_va_range(existing, block) {
            uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
            if (block_test)
                block_test->inject_split_error = false;
        }

        return NV_ERR_NO_MEMORY;
    }

    existing_blocks = split_index + new_index;

    // Copy existing's blocks over to the new range, accounting for the explicit
    // assignment above in case we did a block split. There are two general
    // cases:
    //
    // No split:
    //                              split_index
    //                                   v
    //  existing (before) [----- A ----][----- B ----][----- C ----]
    //  existing (after)  [----- A ----]
    //  new                             [----- B ----][----- C ----]
    //
    // Split:
    //                              split_index
    //                                   v
    //  existing (before) [----- A ----][----- B ----][----- C ----]
    //  existing (after)  [----- A ----][- B -]
    //  new                                    [- N -][----- C ----]
    //                                         ^new->blocks[0]

    // Note, if we split the last block of existing, this won't iterate at all.
    for (; new_index < uvm_va_range_num_blocks(new); new_index++) {
        block = uvm_va_range_block(existing, split_index + new_index);
        if (!block) {
            // new's array was cleared at allocation
            UVM_ASSERT(uvm_va_range_block(new, new_index) == NULL);
            continue;
        }

        // As soon as we make this assignment and drop the lock, the reverse
        // mapping code can start looking at new, so new must be ready to go.
        uvm_mutex_lock(&block->lock);
        UVM_ASSERT(block->va_range == existing);
        block->va_range = new;
        uvm_mutex_unlock(&block->lock);

        // No memory barrier is needed since we're holding the va_space lock in
        // write mode, so no other thread can access the blocks array.
        atomic_long_set(&new->blocks[new_index], (long)block);
        atomic_long_set(&existing->blocks[split_index + new_index], (long)NULL);
    }

    blocks_array_shrink(existing, existing_blocks);

    return NV_OK;
}

NV_STATUS uvm_va_range_split(uvm_va_range_t *existing_va_range,
                             NvU64 new_end,
                             uvm_va_range_t **new_va_range)
{
    uvm_va_space_t *va_space = existing_va_range->va_space;
    uvm_va_range_t *new = NULL;
    uvm_perf_event_data_t event_data;
    NV_STATUS status;

    UVM_ASSERT(existing_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    UVM_ASSERT(new_end > existing_va_range->node.start);
    UVM_ASSERT(new_end < existing_va_range->node.end);
    UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
    uvm_assert_rwsem_locked_write(&va_space->lock);

    new = uvm_va_range_alloc_managed(va_space, new_end + 1, existing_va_range->node.end);
    if (!new) {
        status = NV_ERR_NO_MEMORY;
        goto error;
    }

    // The new va_range is under the same vma. If this is a uvm_vm_open, the
    // caller takes care of updating existing's vma_wrapper for us.
    new->managed.vma_wrapper = existing_va_range->managed.vma_wrapper;

    // Copy over state before splitting blocks so any block lookups happening
    // concurrently on the eviction path will see the new range's data.
    uvm_va_range_get_policy(new)->read_duplication = uvm_va_range_get_policy(existing_va_range)->read_duplication;
    uvm_va_range_get_policy(new)->preferred_location = uvm_va_range_get_policy(existing_va_range)->preferred_location;
    uvm_va_range_get_policy(new)->preferred_nid = uvm_va_range_get_policy(existing_va_range)->preferred_nid;
    uvm_processor_mask_copy(&uvm_va_range_get_policy(new)->accessed_by,
                            &uvm_va_range_get_policy(existing_va_range)->accessed_by);
    uvm_processor_mask_copy(&new->uvm_lite_gpus, &existing_va_range->uvm_lite_gpus);

    status = uvm_va_range_split_blocks(existing_va_range, new);
    if (status != NV_OK)
        goto error;

    // Finally, update the VA range tree
    uvm_range_tree_split(&va_space->va_range_tree, &existing_va_range->node, &new->node);

    if (new->type == UVM_VA_RANGE_TYPE_MANAGED) {
        event_data.range_shrink.range = new;
        uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_RANGE_SHRINK, &event_data);
    }

    if (new_va_range)
        *new_va_range = new;
    return NV_OK;

error:
    uvm_va_range_destroy(new, NULL);
    return status;
}

uvm_va_range_t *uvm_va_range_find(uvm_va_space_t *va_space, NvU64 addr)
{
    uvm_assert_rwsem_locked(&va_space->lock);
    return uvm_va_range_container(uvm_range_tree_find(&va_space->va_range_tree, addr));
}

uvm_va_range_t *uvm_va_space_iter_first(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
{
    uvm_range_tree_node_t *node = uvm_range_tree_iter_first(&va_space->va_range_tree, start, end);
    return uvm_va_range_container(node);
}

uvm_va_range_t *uvm_va_space_iter_next(uvm_va_range_t *va_range, NvU64 end)
{
    uvm_range_tree_node_t *node;

    // Handling a NULL va_range here makes uvm_for_each_va_range_in_safe much
    // less messy
    if (!va_range)
        return NULL;

    node = uvm_range_tree_iter_next(&va_range->va_space->va_range_tree, &va_range->node, end);
    return uvm_va_range_container(node);
}

size_t uvm_va_range_num_blocks(uvm_va_range_t *va_range)
{
    NvU64 start = UVM_VA_BLOCK_ALIGN_DOWN(va_range->node.start);
    NvU64 end = UVM_VA_BLOCK_ALIGN_UP(va_range->node.end); // End is inclusive
    return (end - start) / UVM_VA_BLOCK_SIZE;
}

size_t uvm_va_range_block_index(uvm_va_range_t *va_range, NvU64 addr)
{
    size_t addr_index, start_index, index;

    UVM_ASSERT(addr >= va_range->node.start);
    UVM_ASSERT(addr <= va_range->node.end);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);

    // Each block will cover as much space as possible within the aligned
    // UVM_VA_BLOCK_SIZE, up to the parent VA range boundaries. In other words,
    // the entire VA space can be broken into UVM_VA_BLOCK_SIZE chunks. Even if
    // there are multiple ranges (and thus multiple blocks) per actual
    // UVM_VA_BLOCK_SIZE chunk, none of those will have more than 1 block unless
    // they span a UVM_VA_BLOCK_SIZE alignment boundary.
    addr_index = (size_t)(addr / UVM_VA_BLOCK_SIZE);
    start_index = (size_t)(va_range->node.start / UVM_VA_BLOCK_SIZE);

    index = addr_index - start_index;
    UVM_ASSERT(index < uvm_va_range_num_blocks(va_range));
    return index;
}

NV_STATUS uvm_va_range_block_create(uvm_va_range_t *va_range, size_t index, uvm_va_block_t **out_block)
{
    uvm_va_block_t *block, *old;
    NV_STATUS status;

    block = uvm_va_range_block(va_range, index);
    if (!block) {
        // No block has been created here yet, so allocate one and attempt to
        // insert it. Note that this runs the risk of an out-of-memory error
        // when multiple threads race and all concurrently allocate a block for
        // the same address. This should be extremely rare. There is also
        // precedent in the Linux kernel, which does the same thing for demand-
        // allocation of anonymous pages.
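        // The insert below is a lock-free publish: the new block is installed
        // with a compare-and-swap on blocks[index], and the loser of any race
        // simply releases its candidate block and uses the winner's.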
        status = uvm_va_block_create(va_range,
                                     block_calc_start(va_range, index),
                                     block_calc_end(va_range, index),
                                     &block);
        if (status != NV_OK)
            return status;

        // Try to insert it
        old = (uvm_va_block_t *)nv_atomic_long_cmpxchg(&va_range->blocks[index],
                                                       (long)NULL,
                                                       (long)block);
        if (old) {
            // Someone else beat us on the insert
            uvm_va_block_release(block);
            block = old;
        }
    }

    *out_block = block;
    return NV_OK;
}

uvm_va_block_t *uvm_va_range_block_next(uvm_va_range_t *va_range, uvm_va_block_t *va_block)
{
    uvm_va_space_t *va_space = va_range->va_space;
    size_t i = 0;

    uvm_assert_rwsem_locked(&va_space->lock);

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);

    if (va_block)
        i = uvm_va_range_block_index(va_range, va_block->start) + 1;

    for (; i < uvm_va_range_num_blocks(va_range); i++) {
        va_block = uvm_va_range_block(va_range, i);
        if (va_block) {
            UVM_ASSERT(va_block->va_range == va_range);
            UVM_ASSERT(uvm_va_range_block_index(va_range, va_block->start) == i);
            return va_block;
        }
    }

    return NULL;
}

static NV_STATUS range_unmap_mask(uvm_va_range_t *va_range,
                                  const uvm_processor_mask_t *mask,
                                  uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
    uvm_va_block_t *block;

    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type 0x%x\n", va_range->type);

    if (uvm_processor_mask_empty(mask))
        return NV_OK;

    for_each_va_block_in_va_range(va_range, block) {
        NV_STATUS status;
        uvm_va_block_region_t region = uvm_va_block_region_from_block(block);

        uvm_mutex_lock(&block->lock);
        status = uvm_va_block_unmap_mask(block, block_context, mask, region, NULL);
        if (out_tracker)
            uvm_tracker_add_tracker_safe(out_tracker, &block->tracker);

        uvm_mutex_unlock(&block->lock);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

static NV_STATUS range_unmap(uvm_va_range_t *va_range, uvm_processor_id_t processor, uvm_tracker_t *out_tracker)
{
    uvm_processor_mask_t mask;

    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type 0x%x\n", va_range->type);

    uvm_processor_mask_zero(&mask);
    uvm_processor_mask_set(&mask, processor);

    return range_unmap_mask(va_range, &mask, out_tracker);
}

static NV_STATUS range_map_uvm_lite_gpus(uvm_va_range_t *va_range, uvm_tracker_t *out_tracker)
{
    NV_STATUS status = NV_OK;
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_range->va_space, NULL);

    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);

    if (uvm_processor_mask_empty(&va_range->uvm_lite_gpus))
        return NV_OK;

    for_each_va_block_in_va_range(va_range, va_block) {
        // UVM-Lite GPUs always map with RWA
        uvm_mutex_lock(&va_block->lock);
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
                uvm_va_block_map_mask(va_block,
                                      va_block_context,
                                      &va_range->uvm_lite_gpus,
                                      uvm_va_block_region_from_block(va_block),
                                      NULL,
                                      UVM_PROT_READ_WRITE_ATOMIC,
                                      UvmEventMapRemoteCauseCoherence));
        if (status == NV_OK && out_tracker)
            status = uvm_tracker_add_tracker(out_tracker, &va_block->tracker);

        uvm_mutex_unlock(&va_block->lock);
        if (status != NV_OK)
            break;
    }

    return status;
}

// Calculate the mask of GPUs that should follow the UVM-Lite behaviour
static void calc_uvm_lite_gpus_mask(uvm_va_space_t *va_space,
                                    uvm_processor_id_t preferred_location,
                                    const uvm_processor_mask_t *accessed_by_mask,
                                    uvm_processor_mask_t *uvm_lite_gpus)
{
    uvm_gpu_id_t gpu_id;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    // Zero out the mask first
    uvm_processor_mask_zero(uvm_lite_gpus);

    // If no preferred location is set then there are no GPUs following the
    // UVM-Lite behavior
    if (UVM_ID_IS_INVALID(preferred_location))
        return;

    // If the preferred location is a faultable GPU, then no GPUs should follow
    // the UVM-Lite behaviour.
    if (UVM_ID_IS_GPU(preferred_location) &&
        uvm_processor_mask_test(&va_space->faultable_processors, preferred_location)) {
        return;
    }

    // Otherwise add all non-faultable GPUs to the UVM-Lite mask that have
    // accessed by set.
    for_each_gpu_id_in_mask(gpu_id, accessed_by_mask) {
        if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu_id))
            uvm_processor_mask_set(uvm_lite_gpus, gpu_id);
    }

    // And the preferred location if it's a GPU
    if (UVM_ID_IS_GPU(preferred_location))
        uvm_processor_mask_set(uvm_lite_gpus, preferred_location);
}

// Update the mask of GPUs that follow the UVM-Lite behaviour
static void range_update_uvm_lite_gpus_mask(uvm_va_range_t *va_range)
{
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    calc_uvm_lite_gpus_mask(va_range->va_space,
                            uvm_va_range_get_policy(va_range)->preferred_location,
                            &uvm_va_range_get_policy(va_range)->accessed_by,
                            &va_range->uvm_lite_gpus);
}

NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range,
                                              uvm_processor_id_t preferred_location,
                                              int preferred_cpu_nid,
                                              struct mm_struct *mm,
                                              uvm_tracker_t *out_tracker)
{
    NV_STATUS status;
    uvm_processor_mask_t all_uvm_lite_gpus;
    uvm_processor_mask_t new_uvm_lite_gpus;
    uvm_processor_mask_t set_accessed_by_processors;
    uvm_range_group_range_iter_t iter;
    uvm_range_group_range_t *rgr = NULL;
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *va_block_context;
    uvm_va_policy_t *va_range_policy;

    uvm_assert_rwsem_locked_write(&va_space->lock);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);

    va_range_policy = uvm_va_range_get_policy(va_range);
    if (uvm_va_policy_preferred_location_equal(va_range_policy, preferred_location, preferred_cpu_nid))
        return NV_OK;

    // Mark all range group ranges within this VA range as migrated since the
    // preferred location has changed.
    uvm_range_group_for_each_range_in(rgr, va_space, va_range->node.start, va_range->node.end) {
        uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
        if (list_empty(&rgr->range_group_migrated_list_node))
            list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
        uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
    }

    // Calculate the new UVM-Lite GPUs mask, but don't update va_range state so
    // that we can keep block_page_check_mappings() happy while updating the
    // mappings.
    calc_uvm_lite_gpus_mask(va_space, preferred_location, &va_range_policy->accessed_by, &new_uvm_lite_gpus);

    // If the range contains non-migratable range groups, check that the new
    // UVM-Lite GPUs can all map the new preferred location.
    if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) &&
        UVM_ID_IS_VALID(preferred_location) &&
        !uvm_processor_mask_subset(&new_uvm_lite_gpus, &va_space->accessible_from[uvm_id_value(preferred_location)])) {
        return NV_ERR_INVALID_DEVICE;
    }

    if (UVM_ID_IS_INVALID(preferred_location)) {
        uvm_range_group_for_each_migratability_in_safe(&iter, va_space, va_range->node.start, va_range->node.end) {
            if (!iter.migratable) {
                // Clear the range group association for any unmigratable ranges
                // if there is no preferred location
                status = uvm_range_group_assign_range(va_space, NULL, iter.start, iter.end);
                if (status != NV_OK)
                    return status;
            }
        }
    }

    // Unmap all old and new UVM-Lite GPUs
    //  - GPUs that stop being UVM-Lite need to be unmapped so that they don't
    //    have stale mappings to the old preferred location.
    //  - GPUs that will continue to be UVM-Lite GPUs or are new UVM-Lite GPUs
    //    need to be unmapped so that the new preferred location can be mapped.
    uvm_processor_mask_or(&all_uvm_lite_gpus, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
    status = range_unmap_mask(va_range, &all_uvm_lite_gpus, out_tracker);
    if (status != NV_OK)
        return status;

    // GPUs that stop being UVM-Lite, but are in the accessed_by mask need to
    // have any possible mappings established.
    uvm_processor_mask_andnot(&set_accessed_by_processors, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus);

    // A GPU which had been in UVM-Lite mode before must still be in UVM-Lite
    // mode if it is the new preferred location. Otherwise we'd have to be more
    // careful below to not establish remote mappings to the new preferred
    // location.
    if (UVM_ID_IS_GPU(preferred_location))
        UVM_ASSERT(!uvm_processor_mask_test(&set_accessed_by_processors, preferred_location));

    // The old preferred location should establish new remote mappings if it
    // has accessed-by set.
    if (UVM_ID_IS_VALID(va_range_policy->preferred_location))
        uvm_processor_mask_set(&set_accessed_by_processors, va_range_policy->preferred_location);

    uvm_processor_mask_and(&set_accessed_by_processors, &set_accessed_by_processors, &va_range_policy->accessed_by);

    // Now update the va_range state
    va_range_policy->preferred_location = preferred_location;
    va_range_policy->preferred_nid = preferred_cpu_nid;
    uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus);

    va_block_context = uvm_va_space_block_context(va_space, mm);

    for_each_va_block_in_va_range(va_range, va_block) {
        uvm_processor_id_t id;
        uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);

        for_each_id_in_mask(id, &set_accessed_by_processors) {
            status = uvm_va_block_set_accessed_by(va_block, va_block_context, id);
            if (status != NV_OK)
                return status;
        }

        // Also, mark CPU pages as dirty and remove remote mappings from the
        // new preferred location
        uvm_mutex_lock(&va_block->lock);
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
                                           NULL,
                                           uvm_va_block_set_preferred_location_locked(va_block,
                                                                                      va_block_context,
                                                                                      region));

        if (out_tracker) {
            NV_STATUS tracker_status;

            tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
            if (status == NV_OK)
                status = tracker_status;
        }

        uvm_mutex_unlock(&va_block->lock);

        if (status != NV_OK)
            return status;
    }

    // And lastly map all of the current UVM-Lite GPUs to the resident pages on
    // the new preferred location. Anything that's not resident right now will
    // get mapped on the next PreventMigration().
    return range_map_uvm_lite_gpus(va_range, out_tracker);
}

NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range,
                                       uvm_processor_id_t processor_id,
                                       struct mm_struct *mm,
                                       uvm_tracker_t *out_tracker)
{
    NV_STATUS status;
    uvm_va_block_t *va_block;
    uvm_processor_mask_t new_uvm_lite_gpus;
    uvm_va_space_t *va_space = va_range->va_space;
    uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
    uvm_va_block_context_t *va_block_context;

    // If the range belongs to a non-migratable range group and processor_id is
    // a non-faultable GPU, check that it can map the preferred location
    if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) &&
        UVM_ID_IS_GPU(processor_id) &&
        !uvm_processor_mask_test(&va_space->faultable_processors, processor_id) &&
        !uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(policy->preferred_location)], processor_id))
        return NV_ERR_INVALID_DEVICE;

    uvm_processor_mask_set(&policy->accessed_by, processor_id);

    // If a GPU is already a UVM-Lite GPU then there is nothing else to do.
    if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id))
        return NV_OK;

    // Calculate the new UVM-Lite GPUs mask, but don't update it in the va range
    // yet so that we can keep block_page_check_mappings() happy while updating
    // the mappings.
    calc_uvm_lite_gpus_mask(va_space, policy->preferred_location, &policy->accessed_by, &new_uvm_lite_gpus);

    if (uvm_processor_mask_test(&new_uvm_lite_gpus, processor_id)) {
        // GPUs that become UVM-Lite GPUs need to unmap everything so that they
        // can map the preferred location.
        status = range_unmap(va_range, processor_id, out_tracker);
        if (status != NV_OK)
            return status;
    }

    uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
    va_block_context = uvm_va_space_block_context(va_space, mm);

    for_each_va_block_in_va_range(va_range, va_block) {
        status = uvm_va_block_set_accessed_by(va_block, va_block_context, processor_id);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
                                    uvm_processor_id_t processor_id,
                                    uvm_tracker_t *out_tracker)
{
    uvm_range_group_range_t *rgr = NULL;

    // Mark all range group ranges within this VA range as migrated. We do this
    // to force uvm_range_group_set_migration_policy to re-check the policy
    // state since we're changing it here.
    uvm_range_group_for_each_range_in(rgr, va_range->va_space, va_range->node.start, va_range->node.end) {
        uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
        if (list_empty(&rgr->range_group_migrated_list_node))
            list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
        uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
    }

    uvm_processor_mask_clear(&uvm_va_range_get_policy(va_range)->accessed_by, processor_id);

    // If a UVM-Lite GPU is being removed from the accessed_by mask, it will
    // also stop being a UVM-Lite GPU unless it's also the preferred location.
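    //
    // Illustrative example (hypothetical IDs): if GPU0 is a UVM-Lite GPU and
    // also the preferred location, clearing it from accessed_by does not unmap
    // it here. If GPU1 is a UVM-Lite GPU but not the preferred location,
    // clearing it from accessed_by unmaps it below and it drops out of
    // uvm_lite_gpus when the mask is recomputed.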
    if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
        !uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
        range_unmap(va_range, processor_id, out_tracker);
    }

    range_update_uvm_lite_gpus_mask(va_range);
}

NV_STATUS uvm_va_range_set_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm)
{
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *va_block_context;

    if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED)
        return NV_OK;

    va_block_context = uvm_va_space_block_context(va_range->va_space, mm);

    for_each_va_block_in_va_range(va_range, va_block) {
        NV_STATUS status = uvm_va_block_set_read_duplication(va_block, va_block_context);

        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

NV_STATUS uvm_va_range_unset_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm)
{
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *va_block_context;
    NV_STATUS status;

    if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
        return NV_OK;

    va_block_context = uvm_va_space_block_context(va_range->va_space, mm);

    for_each_va_block_in_va_range(va_range, va_block) {
        status = uvm_va_block_unset_read_duplication(va_block, va_block_context);

        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(struct vm_area_struct *vma)
{
    uvm_vma_wrapper_t *vma_wrapper = nv_kmem_cache_zalloc(g_uvm_vma_wrapper_cache, NV_UVM_GFP_FLAGS);
    if (!vma_wrapper)
        return NULL;

    vma_wrapper->vma = vma;
    uvm_init_rwsem(&vma_wrapper->lock, UVM_LOCK_ORDER_LEAF);

    return vma_wrapper;
}

void uvm_vma_wrapper_destroy(uvm_vma_wrapper_t *vma_wrapper)
{
    if (!vma_wrapper)
        return;

    uvm_assert_rwsem_unlocked(&vma_wrapper->lock);

    kmem_cache_free(g_uvm_vma_wrapper_cache, vma_wrapper);
}

static NvU64 sked_reflected_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
{
    (void)caller_data;

    return range_vec->tree->hal->make_sked_reflected_pte();
}

static NV_STATUS uvm_map_sked_reflected_range(uvm_va_space_t *va_space, UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS *params)
{
    NV_STATUS status;
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *gpu;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_page_tree_t *page_tables;
    struct mm_struct *mm;

    if (uvm_api_range_invalid_4k(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    // The mm needs to be locked in order to remove stale HMM va_blocks.
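    //
    // A rough sketch of the locking pattern used by this and the following
    // entry points (this is just the shape of the code below, not a separate
    // implementation):
    //
    //     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    //     uvm_va_space_down_write(va_space);
    //     ... create or modify the VA range ...
    //     uvm_va_space_up_write(va_space);
    //     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    //
    // The mm is retained and locked before the VA space lock is taken, and
    // released after it is dropped, keeping the ordering the same in every
    // handler in this file.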
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->gpuUuid);
    if (!gpu) {
        status = NV_ERR_INVALID_DEVICE;
        goto done;
    }

    // Check if the GPU can access the VA
    if (!uvm_gpu_can_address(gpu, params->base, params->length)) {
        status = NV_ERR_OUT_OF_RANGE;
        goto done;
    }

    gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
    page_tables = &gpu_va_space->page_tables;

    // The VA range must exactly cover one supported GPU page (see the worked
    // example at the end of this file).
    if (!is_power_of_2(params->length) ||
        !IS_ALIGNED(params->base, params->length) ||
        !uvm_mmu_page_size_supported(page_tables, params->length)) {
        status = NV_ERR_INVALID_ADDRESS;
        goto done;
    }

    status = uvm_va_range_create_sked_reflected(va_space, mm, params->base, params->length, &va_range);
    if (status != NV_OK) {
        UVM_DBG_PRINT_RL("Failed to create sked reflected VA range [0x%llx, 0x%llx)\n",
                         params->base, params->base + params->length);
        goto done;
    }

    va_range->sked_reflected.gpu_va_space = gpu_va_space;

    status = uvm_page_table_range_vec_init(page_tables,
                                           va_range->node.start,
                                           uvm_va_range_size(va_range),
                                           params->length,
                                           UVM_PMM_ALLOC_FLAGS_EVICT,
                                           &va_range->sked_reflected.pt_range_vec);
    if (status != NV_OK)
        goto done;

    status = uvm_page_table_range_vec_write_ptes(&va_range->sked_reflected.pt_range_vec,
                                                 UVM_MEMBAR_NONE, sked_reflected_pte_maker, NULL);
    if (status != NV_OK)
        goto done;

done:
    if (status != NV_OK && va_range != NULL)
        uvm_va_range_destroy(va_range, NULL);

    uvm_va_space_up_write(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);

    return status;
}

NV_STATUS uvm_api_map_dynamic_parallelism_region(UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // The ranges created by the UvmMapDynamicParallelismRegion() API are
    // referred to internally as "SKED reflected ranges" since that name is
    // more descriptive.
    return uvm_map_sked_reflected_range(va_space, params);
}

NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params, struct file *filp)
{
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range = NULL;
    uvm_gpu_t *gpu;
    struct mm_struct *mm;

    if (uvm_api_range_invalid(params->base, params->length))
        return NV_ERR_INVALID_ADDRESS;

    if (params->gpuAttributesCount > UVM_MAX_GPUS)
        return NV_ERR_INVALID_ARGUMENT;

    if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
        return NV_ERR_INVALID_ARGUMENT;

    // The mm needs to be locked in order to remove stale HMM va_blocks.
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    status = uvm_va_range_create_semaphore_pool(va_space,
                                                mm,
                                                params->base,
                                                params->length,
                                                params->perGpuAttributes,
                                                params->gpuAttributesCount,
                                                &va_range);
    if (status != NV_OK)
        goto unlock;

    for_each_va_space_gpu(gpu, va_space) {
        status = va_range_register_gpu_semaphore_pool(va_range, gpu);
        if (status != NV_OK)
            goto done;

        if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
            continue;

        status = va_range_add_gpu_va_space_semaphore_pool(va_range, gpu);
        if (status != NV_OK)
            goto done;
    }

done:
    if (status != NV_OK)
        uvm_va_range_destroy(va_range, NULL);

unlock:
    uvm_va_space_up_write(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_test_va_range_info(UVM_TEST_VA_RANGE_INFO_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space;
    uvm_va_range_t *va_range;
    uvm_processor_id_t processor_id;
    uvm_va_policy_t *policy;
    struct vm_area_struct *vma;
    NV_STATUS status = NV_OK;
    struct mm_struct *mm;

    va_space = uvm_va_space_get(filp);

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
    if (!va_range) {
        status = uvm_hmm_va_range_info(va_space, mm, params);
        goto out;
    }

    policy = uvm_va_range_get_policy(va_range);
    params->va_range_start = va_range->node.start;
    params->va_range_end = va_range->node.end;

    // -Wall implies -Wenum-compare, so cast through int to avoid warnings
    BUILD_BUG_ON((int)UVM_READ_DUPLICATION_UNSET != (int)UVM_TEST_READ_DUPLICATION_UNSET);
    BUILD_BUG_ON((int)UVM_READ_DUPLICATION_ENABLED != (int)UVM_TEST_READ_DUPLICATION_ENABLED);
    BUILD_BUG_ON((int)UVM_READ_DUPLICATION_DISABLED != (int)UVM_TEST_READ_DUPLICATION_DISABLED);
    BUILD_BUG_ON((int)UVM_READ_DUPLICATION_MAX != (int)UVM_TEST_READ_DUPLICATION_MAX);
    params->read_duplication = policy->read_duplication;

    if (UVM_ID_IS_INVALID(policy->preferred_location)) {
        memset(&params->preferred_location, 0, sizeof(params->preferred_location));
        params->preferred_cpu_nid = NUMA_NO_NODE;
    }
    else {
        uvm_va_space_processor_uuid(va_space, &params->preferred_location, policy->preferred_location);
        params->preferred_cpu_nid = policy->preferred_nid;
    }

    params->accessed_by_count = 0;
    for_each_id_in_mask(processor_id, &policy->accessed_by)
        uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);

    // -Wall implies -Wenum-compare, so cast through int to avoid warnings
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_INVALID != (int)UVM_VA_RANGE_TYPE_INVALID);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_MANAGED != (int)UVM_VA_RANGE_TYPE_MANAGED);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_EXTERNAL != (int)UVM_VA_RANGE_TYPE_EXTERNAL);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_CHANNEL != (int)UVM_VA_RANGE_TYPE_CHANNEL);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_SKED_REFLECTED != (int)UVM_VA_RANGE_TYPE_SKED_REFLECTED);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_SEMAPHORE_POOL != (int)UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
    BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_MAX != (int)UVM_VA_RANGE_TYPE_MAX);
    params->type = va_range->type;

    switch (va_range->type) {
        case UVM_VA_RANGE_TYPE_MANAGED:
            params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_UVM;
            if (!va_range->managed.vma_wrapper) {
                params->managed.is_zombie = NV_TRUE;
                goto out;
            }
            params->managed.is_zombie = NV_FALSE;
            vma = uvm_va_range_vma_check(va_range, mm);
            if (!vma) {
                // We aren't in the same mm as the one which owns the vma, and
                // we don't have that mm locked.
                params->managed.owned_by_calling_process = NV_FALSE;
                goto out;
            }
            params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
            params->managed.vma_start = vma->vm_start;
            params->managed.vma_end = vma->vm_end - 1;
            break;
        default:
            break;
    }

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_test_va_range_split(UVM_TEST_VA_RANGE_SPLIT_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    NV_STATUS status = NV_OK;

    // split_address is an inclusive end address, so the byte after it must be
    // page-aligned.
    if (!PAGE_ALIGNED(params->split_address + 1))
        return NV_ERR_INVALID_ADDRESS;

    uvm_va_space_down_write(va_space);

    va_range = uvm_va_range_find(va_space, params->split_address);
    if (!va_range ||
        va_range->node.end == params->split_address ||
        va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    status = uvm_va_range_split(va_range, params->split_address, NULL);

out:
    uvm_va_space_up_write(va_space);
    return status;
}

NV_STATUS uvm_test_va_range_inject_split_error(UVM_TEST_VA_RANGE_INJECT_SPLIT_ERROR_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
    if (!va_range) {
        if (!mm)
            status = NV_ERR_INVALID_ADDRESS;
        else
            status = uvm_hmm_test_va_block_inject_split_error(va_space, params->lookup_address);
    }
    else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
        status = NV_ERR_INVALID_ADDRESS;
    }
    else {
        uvm_va_block_t *va_block;
        size_t split_index;

        va_range->inject_split_error = true;

        split_index = uvm_va_range_block_index(va_range, params->lookup_address);
        va_block = uvm_va_range_block(va_range, split_index);
        if (va_block) {
            uvm_va_block_test_t *block_test = uvm_va_block_get_test(va_block);

            if (block_test)
                block_test->inject_split_error = true;
        }
    }

    uvm_va_space_up_write(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

NV_STATUS uvm_test_va_range_inject_add_gpu_va_space_error(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR_PARAMS *params,
                                                          struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    NV_STATUS status = NV_OK;

    uvm_va_space_down_write(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
    if (!va_range) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }

    va_range->inject_add_gpu_va_space_error = true;

out:
    uvm_va_space_up_write(va_space);
    return status;
}
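
// Worked example for the SKED reflected range checks in
// uvm_map_sked_reflected_range() above. The addresses and the assumption that
// the GPU supports a 64K page size are hypothetical: base = 0x7fab10000 with
// length = 0x10000 passes, since the length is a power of two, the base is
// aligned to the length, and the length matches a supported GPU page size.
// base = 0x7fab18000 with the same length fails the IS_ALIGNED() check, and
// length = 0x18000 fails is_power_of_2(); both cases return
// NV_ERR_INVALID_ADDRESS.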