/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_ioctl.h"
#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_channel.h"
#include "uvm_processors.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_tools.h"
#include "uvm_migrate.h"
#include "uvm_migrate_pageable.h"
#include "uvm_va_space_mm.h"
#include "nv_speculation_barrier.h"

typedef enum
{
    UVM_MIGRATE_PASS_FIRST,
    UVM_MIGRATE_PASS_SECOND
} uvm_migrate_pass_t;

static int uvm_perf_migrate_cpu_preunmap_enable = 1;
module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO);

#define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2
#define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX     10
static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);

// Global post-processed values of the module parameters
static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;

static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
{
    NvU64 end = base + length - 1;

    if (end > first_va_range->node.end)
        return false;

    return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
}

static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
                                                uvm_va_block_retry_t *va_block_retry,
                                                uvm_va_block_context_t *va_block_context,
                                                uvm_va_block_region_t region,
                                                uvm_processor_id_t dest_id)
{
    uvm_prot_t prot;
    uvm_page_index_t page_index;
    NV_STATUS status = NV_OK;
    const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id);

    for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot)
        va_block_context->mask_by_prot[prot - 1].count = 0;

    // Only map those pages that are not already mapped on destination
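    //
    // Pages are bucketed below by the highest protection they can be granted
    // without requiring a revocation; each non-empty bucket is then mapped
    // with a single uvm_va_block_map() call.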
    for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
        prot = uvm_va_block_page_compute_highest_permission(va_block, va_block_context, dest_id, page_index);
        if (prot == UVM_PROT_NONE)
            continue;

        if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
            uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);

        uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
    }

    for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
        if (va_block_context->mask_by_prot[prot - 1].count == 0)
            continue;

        // We pass UvmEventMapRemoteCauseInvalid since the destination processor
        // of a migration will never be mapped remotely
        status = uvm_va_block_map(va_block,
                                  va_block_context,
                                  dest_id,
                                  region,
                                  &va_block_context->mask_by_prot[prot - 1].page_mask,
                                  prot,
                                  UvmEventMapRemoteCauseInvalid,
                                  &va_block->tracker);
        if (status != NV_OK)
            break;

        // Whoever added the other mapping(s) should have already added
        // SetAccessedBy processors
    }

    return status;
}

static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
                                                  uvm_va_block_retry_t *va_block_retry,
                                                  uvm_va_block_context_t *va_block_context,
                                                  uvm_va_block_region_t region,
                                                  uvm_processor_id_t dest_id)
{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;

    // Get the mask of unmapped pages because it will change after the
    // first map operation
    uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);

    if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
        // Do not map pages that are already resident on the CPU. This is in
        // order to avoid breaking system-wide atomic operations on HMM. HMM's
        // implementation of system-wide atomic operations involves restricting
        // mappings to one processor (CPU or a GPU) at a time. If we were to
        // grant a GPU a mapping to system memory, this gets into trouble
        // because, on the CPU side, Linux can silently upgrade PTE permissions
        // (move from read-only, to read-write, without any MMU notifiers
        // firing), thus breaking the model by allowing simultaneous read-write
        // access from two separate processors. To avoid that, just don't map
        // such pages at all, when migrating.
        uvm_page_mask_andnot(&va_block_context->caller_page_mask,
                             &va_block_context->caller_page_mask,
                             uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
    }

    // Only map those pages that are not mapped anywhere else (likely due
    // to a first touch or a migration). We pass
    // UvmEventMapRemoteCauseInvalid since the destination processor of a
    // migration will never be mapped remotely.
    status = uvm_va_block_map(va_block,
                              va_block_context,
                              dest_id,
                              region,
                              &va_block_context->caller_page_mask,
                              UVM_PROT_READ_WRITE_ATOMIC,
                              UvmEventMapRemoteCauseInvalid,
                              &local_tracker);
    if (status != NV_OK)
        goto out;

    // Add mappings for AccessedBy processors
    //
    // No mappings within this call will operate on dest_id, so we don't
    // need to acquire the map operation above.
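    //
    // Note: the same caller_page_mask computed above is passed here, so only
    // the pages that were unmapped everywhere before this function ran are
    // considered.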
    status = uvm_va_block_add_mappings_after_migration(va_block,
                                                       va_block_context,
                                                       dest_id,
                                                       dest_id,
                                                       region,
                                                       &va_block_context->caller_page_mask,
                                                       UVM_PROT_READ_WRITE_ATOMIC,
                                                       NULL);

out:
    tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
    uvm_tracker_deinit(&local_tracker);
    return status == NV_OK ? tracker_status : status;
}

// Pages that are not mapped anywhere can be safely mapped with RWA permission.
// For the rest of the pages we need to individually compute the maximum
// permission that does not require a revocation.
static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block,
                                            uvm_va_block_retry_t *va_block_retry,
                                            uvm_va_block_context_t *va_block_context,
                                            uvm_va_block_region_t region,
                                            uvm_processor_id_t dest_id)
{
    NV_STATUS status;

    status = block_migrate_map_unmapped_pages(va_block,
                                              va_block_retry,
                                              va_block_context,
                                              region,
                                              dest_id);
    if (status != NV_OK)
        return status;

    return block_migrate_map_mapped_pages(va_block,
                                          va_block_retry,
                                          va_block_context,
                                          region,
                                          dest_id);
}

NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
                                      uvm_va_block_retry_t *va_block_retry,
                                      uvm_service_block_context_t *service_context,
                                      uvm_va_block_region_t region,
                                      uvm_processor_id_t dest_id,
                                      uvm_migrate_mode_t mode,
                                      uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    uvm_va_block_context_t *va_block_context = service_context->block_context;
    NV_STATUS status, tracker_status = NV_OK;

    uvm_assert_mutex_locked(&va_block->lock);
    UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context->hmm.vma, region));

    if (uvm_va_block_is_hmm(va_block)) {
        status = uvm_hmm_va_block_migrate_locked(va_block,
                                                 va_block_retry,
                                                 service_context,
                                                 dest_id,
                                                 region,
                                                 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
    }
    else {
        uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);

        if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
            status = uvm_va_block_make_resident_read_duplicate(va_block,
                                                               va_block_retry,
                                                               va_block_context,
                                                               dest_id,
                                                               region,
                                                               NULL,
                                                               NULL,
                                                               UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
        else {
            status = uvm_va_block_make_resident(va_block,
                                                va_block_retry,
                                                va_block_context,
                                                dest_id,
                                                region,
                                                NULL,
                                                NULL,
                                                UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
    }

    if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
        // block_migrate_add_mappings will acquire the work from the above
        // make_resident call and update the VA block tracker.
        status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id);
    }

    if (out_tracker)
        tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);

    return status == NV_OK ? tracker_status : status;
}

// Unmapping CPU pages on P9 systems is very costly, to the point that it
// becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for
// migrations that need to remove CPU mappings compared to migrations that only
// create CPU mappings. The overhead can be fully attributed to the TLB
// shootdown.
// When a CPU page is unmapped, it needs to (1) invalidate any copy
// in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK
// to remove the corresponding entries in the GPUs' TLBs. ATSDs are not even
// required when migrating managed memory since UVM ensures that there are no
// ATS entries cached in the GPU TLBs for the managed VA ranges. However, we
// don't have a way to skip them as of today.
//
// In order to minimize the overhead of CPU unmaps during UvmMigrate we try to
// call unmap_mapping_range on VA regions larger than the VA block granularity
// before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued. This
// is because the NPU code uses MMU notifiers in order to get a callback
// (invalidate_range) when a TLB invalidation is required. Fortunately, this
// callback is not called if there is nothing to be invalidated. Therefore, if
// we issue a large unmap, subsequent unmaps within that region will not invoke
// the callback.
//
// However, due to (1), even issuing a single invalidate for the whole migrated
// range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
// This is only expected to get worse if CPU-GPU interconnects' BW keeps
// increasing.
//
// Thus, VA range migrations are split into groups of contiguous VA blocks, and
// we trigger a single pre-unmap of each group of VA blocks in the Linux kernel
// before the VA blocks' migration starts. This way, we trigger larger (more
// efficient) TLB invalidations than when we do it one VA block at a time, while
// still being able to pipeline the migration, which allows us to hide most of
// the costs of (1).
//
// However, there are some cases in which the CPU has mappings to the pages
// being migrated but they don't need to be removed (which can introduce
// unnecessary CPU faults later on). Therefore, we skip the pre-unmap step
// under the following conditions:
// - Pages mapped by the CPU that are *already* in the destination.
// - Pages mapped by the CPU that are *not* in the destination but
//   read-duplication is enabled in the VA range.

// This function checks if the pre-unmap optimization is required given the
// system capabilities and the destination of the migration. This is to skip
// any subsequent checks required by the optimization, which can be costly.
//
// The current logic checks that:
// - We are in the first pass of the migration (see the explanation of the
//   two-pass strategy in uvm_migrate).
// - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't need
//   this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks. Otherwise, we skip the preunmap to
//   avoid the overhead.
static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
                                             uvm_migrate_pass_t pass,
                                             bool is_single_block)
{
    if (!g_uvm_perf_migrate_cpu_preunmap_enable)
        return false;

    if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
        return false;

    if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
        return false;

    return true;
}

// This function determines if the VA range properties avoid the need to remove
// CPU mappings on UvmMigrate.
// Currently, it only checks whether read-duplication is enabled in the VA
// range. This is because, when migrating read-duplicated VA blocks, the source
// processor doesn't need to be unmapped (though it may need write access
// revoked).
static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
                                            uvm_va_space_t *va_space)
{
    return !uvm_va_policy_is_read_duplicate(policy, va_space);
}

// This function determines if the VA block to be migrated contains pages with
// CPU mappings that don't need to be removed (see the comment above). In that
// case it returns false. Otherwise it returns true and stores in
// *num_unmap_pages the number of pages that do need their CPU mappings
// removed.
static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
                                            uvm_va_block_context_t *va_block_context,
                                            NvU64 start,
                                            NvU64 end,
                                            uvm_processor_id_t dest_id,
                                            NvU32 *num_unmap_pages)
{
    const uvm_page_mask_t *mapped_pages_cpu;
    NvU32 num_cpu_unchanged_pages = 0;
    uvm_va_block_region_t region;

    *num_unmap_pages = 0;

    if (!va_block)
        return true;

    region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));

    uvm_mutex_lock(&va_block->lock);

    mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
    if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
        const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
        uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;

        // TODO: Bug 1877578
        //
        // We assume that if pages are mapped on the CPU and not resident on
        // the destination, the pages will change residency so the CPU must be
        // unmapped. If we implement automatic read-duplication heuristics in
        // the future, we'll also need to check if the pages are being
        // read-duplicated.
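        //
        // do_not_unmap_pages: pages mapped on the CPU that are already
        // resident on the destination, so their CPU mappings can be kept.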
        uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest);

        num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region);
    }

    *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages;

    uvm_mutex_unlock(&va_block->lock);

    return num_cpu_unchanged_pages == 0;
}

static void preunmap_multi_block(uvm_va_range_t *va_range,
                                 uvm_va_block_context_t *va_block_context,
                                 NvU64 start,
                                 NvU64 end,
                                 uvm_processor_id_t dest_id)
{
    size_t i;
    const size_t first_block_index = uvm_va_range_block_index(va_range, start);
    const size_t last_block_index = uvm_va_range_block_index(va_range, end);
    NvU32 num_unmap_pages = 0;

    UVM_ASSERT(start >= va_range->node.start);
    UVM_ASSERT(end <= va_range->node.end);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));

    for (i = first_block_index; i <= last_block_index; i++) {
        NvU32 num_block_unmap_pages;

        if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
                                             va_block_context,
                                             start,
                                             end,
                                             dest_id,
                                             &num_block_unmap_pages)) {
            return;
        }

        num_unmap_pages += num_block_unmap_pages;
    }

    if (num_unmap_pages > 0)
        unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
}

static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
                                                  uvm_service_block_context_t *service_context,
                                                  NvU64 start,
                                                  NvU64 end,
                                                  uvm_processor_id_t dest_id,
                                                  uvm_migrate_mode_t mode,
                                                  uvm_tracker_t *out_tracker)
{
    size_t i;
    const size_t first_block_index = uvm_va_range_block_index(va_range, start);
    const size_t last_block_index = uvm_va_range_block_index(va_range, end);

    UVM_ASSERT(start >= va_range->node.start);
    UVM_ASSERT(end <= va_range->node.end);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));

    // Iterate over blocks, populating them if necessary
    for (i = first_block_index; i <= last_block_index; i++) {
        uvm_va_block_retry_t va_block_retry;
        uvm_va_block_region_t region;
        uvm_va_block_t *va_block;
        NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);

        if (status != NV_OK)
            return status;

        region = uvm_va_block_region_from_start_end(va_block,
                                                    max(start, va_block->start),
                                                    min(end, va_block->end));

        status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
                                         &va_block_retry,
                                         uvm_va_block_migrate_locked(va_block,
                                                                     &va_block_retry,
                                                                     service_context,
                                                                     region,
                                                                     dest_id,
                                                                     mode,
                                                                     out_tracker));
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
                                      uvm_service_block_context_t *service_context,
                                      NvU64 start,
                                      NvU64 end,
                                      uvm_processor_id_t dest_id,
                                      uvm_migrate_mode_t mode,
                                      bool should_do_cpu_preunmap,
                                      uvm_tracker_t *out_tracker)
{
    NvU64 preunmap_range_start = start;
    uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);

    should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(policy, va_range->va_space);

    // Divide the migration into groups of contiguous VA blocks, so that CPU
    // unmaps for each group can be issued before that group's migration starts.
    while (preunmap_range_start < end) {
        NV_STATUS status;
        NvU64 preunmap_range_end;

        if (should_do_cpu_preunmap) {
            preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
            preunmap_range_end = min(preunmap_range_end - 1, end);

            preunmap_multi_block(va_range,
                                 service_context->block_context,
                                 preunmap_range_start,
                                 preunmap_range_end,
                                 dest_id);
        }
        else {
            preunmap_range_end = end;
        }

        status = uvm_va_range_migrate_multi_block(va_range,
                                                  service_context,
                                                  preunmap_range_start,
                                                  preunmap_range_end,
                                                  dest_id,
                                                  mode,
                                                  out_tracker);
        if (status != NV_OK)
            return status;

        preunmap_range_start = preunmap_range_end + 1;
    }

    return NV_OK;
}

static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
                                    uvm_service_block_context_t *service_context,
                                    uvm_va_range_t *first_va_range,
                                    NvU64 base,
                                    NvU64 length,
                                    uvm_processor_id_t dest_id,
                                    uvm_migrate_mode_t mode,
                                    bool should_do_cpu_preunmap,
                                    uvm_tracker_t *out_tracker)
{
    uvm_va_range_t *va_range, *va_range_last;
    NvU64 end = base + length - 1;
    NV_STATUS status = NV_OK;
    bool skipped_migrate = false;

    if (!first_va_range) {
        // For HMM, we iterate over va_blocks since there is no va_range.
        return uvm_hmm_migrate_ranges(va_space, service_context, base, length, dest_id, mode, out_tracker);
    }

    UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));

    va_range_last = NULL;
    uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
        uvm_range_group_range_iter_t iter;
        uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);

        va_range_last = va_range;

        // Only managed ranges can be migrated
        if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
            status = NV_ERR_INVALID_ADDRESS;
            break;
        }

        // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
        // into many range groups. For this reason, we iterate over each
        // va_range first and then over the range groups within it.
        uvm_range_group_for_each_migratability_in(&iter,
                                                  va_space,
                                                  max(base, va_range->node.start),
                                                  min(end, va_range->node.end)) {
            // Skip non-migratable VA ranges
            if (!iter.migratable) {
                // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages
                // aren't already resident at dest_id.
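                //
                // Note that a matching preferred location (including the CPU
                // NUMA node when the destination is the CPU) is treated as
                // "already resident" here, so the skip is not reported in that
                // case.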
                if (!uvm_va_policy_preferred_location_equal(policy,
                                                            dest_id,
                                                            service_context->block_context->make_resident.dest_nid))
                    skipped_migrate = true;
            }
            else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
                     !uvm_id_equal(dest_id, policy->preferred_location)) {
                // Don't migrate to a non-faultable GPU that is in UVM-Lite
                // mode, unless it's the preferred location
                status = NV_ERR_INVALID_DEVICE;
                break;
            }
            else {
                status = uvm_va_range_migrate(va_range,
                                              service_context,
                                              iter.start,
                                              iter.end,
                                              dest_id,
                                              mode,
                                              should_do_cpu_preunmap,
                                              out_tracker);
                if (status != NV_OK)
                    break;
            }
        }
    }

    if (status != NV_OK)
        return status;

    // Check that we were able to iterate over the entire range without any gaps
    if (!va_range_last || va_range_last->node.end < end)
        return NV_ERR_INVALID_ADDRESS;

    if (skipped_migrate)
        return NV_WARN_MORE_PROCESSING_REQUIRED;

    return NV_OK;
}

static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
                             struct mm_struct *mm,
                             NvU64 base,
                             NvU64 length,
                             uvm_processor_id_t dest_id,
                             int dest_nid,
                             NvU32 migrate_flags,
                             uvm_va_range_t *first_va_range,
                             uvm_tracker_t *out_tracker)
{
    NV_STATUS status = NV_OK;
    uvm_service_block_context_t *service_context;
    bool do_mappings;
    bool do_two_passes;
    bool is_single_block;
    bool should_do_cpu_preunmap;

    uvm_assert_rwsem_locked(&va_space->lock);

    // If the GPU has its memory disabled, just skip the migration and let
    // faults take care of things.
    if (!uvm_va_space_processor_has_memory(va_space, dest_id))
        return NV_OK;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);
    else if (!first_va_range)
        return NV_ERR_INVALID_ADDRESS;

    service_context = uvm_service_block_context_alloc(mm);
    if (!service_context)
        return NV_ERR_NO_MEMORY;

    service_context->block_context->make_resident.dest_nid = dest_nid;

    // We perform two passes (unless the migration only covers a single VA
    // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the
    // following scenarios:
    //
    // - Migrations that add CPU mappings, since they are synchronous operations
    //   that delay the migration of the next VA blocks.
    // - Concurrent migrations. This is due to our current channel selection
    //   logic that doesn't prevent false dependencies between independent
    //   operations. For example, removal of mappings for outgoing transfers is
    //   delayed by the mappings added by incoming transfers.
    //   TODO: Bug 1764953: Re-evaluate the two-pass logic when channel
    //   selection is overhauled.
    //
    // The two passes are as follows:
    //
    // 1- Transfer all VA blocks (do not add mappings).
    // 2- Go block by block re-executing the transfer (in case someone moved it
    //    since the first pass), and adding the mappings.
    //
    // For HMM (!first_va_range), we always do a single pass.
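    //
    // Note: a single-block migration has nothing to pipeline, so it always
    // takes the single-pass path, and a CPU-destination migration with
    // UVM_MIGRATE_FLAG_SKIP_CPU_MAP skips the mapping pass entirely.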
    is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
    do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
    do_two_passes = do_mappings && !is_single_block;

    if (do_two_passes) {
        should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block);

        status = uvm_migrate_ranges(va_space,
                                    service_context,
                                    first_va_range,
                                    base,
                                    length,
                                    dest_id,
                                    UVM_MIGRATE_MODE_MAKE_RESIDENT,
                                    should_do_cpu_preunmap,
                                    out_tracker);
    }

    if (status == NV_OK) {
        uvm_migrate_mode_t mode = do_mappings ? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP :
                                                UVM_MIGRATE_MODE_MAKE_RESIDENT;
        uvm_migrate_pass_t pass = do_two_passes ? UVM_MIGRATE_PASS_SECOND :
                                                  UVM_MIGRATE_PASS_FIRST;
        should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block);

        status = uvm_migrate_ranges(va_space,
                                    service_context,
                                    first_va_range,
                                    base,
                                    length,
                                    dest_id,
                                    mode,
                                    should_do_cpu_preunmap,
                                    out_tracker);
    }

    uvm_service_block_context_free(service_context);

    return status;
}

static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu,
                                            uvm_va_range_semaphore_pool_t *semaphore_va_range,
                                            NvU64 semaphore_user_addr,
                                            NvU32 semaphore_payload,
                                            uvm_tracker_t *release_after_tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_channel_type_t channel_type;
    NvU64 semaphore_gpu_va;
    NvU64 semaphore_offset;

    UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu));

    semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr;
    semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset;

    // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a
    // performance standpoint because if the migration is targeting a GPU, it
    // is likely that the channel used for the GPU page table update (pushed to
    // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. Using a
    // single channel avoids an inter-channel dependency that could otherwise
    // add significant overhead to the enclosing migration.
    //
    // In SR-IOV heavy, the user semaphore release is functionally forbidden
    // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is
    // not a page tree operation.
    if (uvm_parent_gpu_is_virt_mode_sriov_heavy(gpu->parent))
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    else
        channel_type = UVM_CHANNEL_TYPE_MEMOPS;

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    channel_type,
                                    release_after_tracker,
                                    &push,
                                    "Pushing semaphore release (*0x%llx = %u)",
                                    semaphore_user_addr,
                                    semaphore_payload);
    if (status != NV_OK)
        return status;

    gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload);
    uvm_push_end(&push);

    uvm_mutex_lock(&semaphore_va_range->tracker_lock);
    status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push);
    uvm_tracker_remove_completed(&semaphore_va_range->tracker);
    uvm_mutex_unlock(&semaphore_va_range->tracker_lock);

    return status;
}

static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload)
{
    char *semaphore_cpu_va;
    NvU64 semaphore_offset;

    UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem));

    semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr;

    // Prevent processor speculation prior to accessing user-mapped memory to
    // avoid leaking information from side-channel attacks. Under speculation, a
    // valid VA range which does not contain this semaphore could be used by the
    // caller. It's unclear but likely that the user might be able to control
    // the data at that address. Auditing all potential ways that could happen
    // is difficult and error-prone, so to be on the safe side we'll just always
    // block speculation.
    nv_speculation_barrier();

    semaphore_cpu_va = (char *)uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;

    UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
}

static NV_STATUS semaphore_release(NvU64 semaphore_address,
                                   NvU32 semaphore_payload,
                                   uvm_va_range_semaphore_pool_t *semaphore_pool,
                                   uvm_gpu_t *dest_gpu,
                                   uvm_tracker_t *tracker_ptr)
{
    uvm_gpu_t *gpu;
    uvm_gpu_t *gpu_owner = semaphore_pool->owner;

    // If there is a GPU owner, release the semaphore from it.
    if (gpu_owner != NULL)
        return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);

    // Attempt eager release from CPU if the tracker is already completed.
    if (uvm_tracker_is_completed(tracker_ptr)) {
        semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload);
        return NV_OK;
    }

    if (dest_gpu == NULL) {
        // The destination is the CPU, but we didn't do a CPU release above
        // because the previous work is not complete. This situation arises when
        // accessed_by mappings are being set up asynchronously, or the
        // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should
        // be a registered GPU, since all CPU work is synchronous, and the
        // semaphore must be mapped on that GPU.
        //
        // Note that the GPU selected for the release may not be the same device
        // that prevented the tracker from being complete.
        gpu = uvm_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on);

        UVM_ASSERT(gpu != NULL);
    }
    else {
        gpu = dest_gpu;
    }

    return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
}

NV_STATUS uvm_migrate_init(void)
{
    NV_STATUS status = uvm_migrate_pageable_init();
    if (status != NV_OK)
        return status;

    g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0;

    BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1));

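    // For example (assuming the usual 2MB UVM_VA_BLOCK_SIZE): the default
    // block order of 2 gives an 8MB preunmap granularity, and the maximum
    // order of 10 gives 2GB.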
    if (g_uvm_perf_migrate_cpu_preunmap_enable) {
        if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) {
            g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order;
        }
        else {
            g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;

            pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n",
                    uvm_perf_migrate_cpu_preunmap_block_order,
                    UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT);
        }
    }

    return NV_OK;
}

void uvm_migrate_exit(void)
{
    uvm_migrate_pageable_exit();
}

NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    uvm_tracker_t *tracker_ptr = NULL;
    uvm_gpu_t *dest_gpu = NULL;
    uvm_va_range_t *sema_va_range = NULL;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;
    bool flush_events = false;
    const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC);
    int cpu_numa_node = (int)params->cpuNumaNode;

    // We temporarily allow 0 length in the IOCTL parameters as a signal to
    // only release the semaphore. This is because user-space is in charge of
    // migrating pageable memory in some cases.
    //
    // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch
    // to migrate_vma for all types of vmas.
    if (params->length > 0 || synchronous || params->semaphoreAddress == 0) {
        if (uvm_api_range_invalid(params->base, params->length))
            return NV_ERR_INVALID_ADDRESS;
    }

    if (params->flags & ~UVM_MIGRATE_FLAGS_ALL)
        return NV_ERR_INVALID_ARGUMENT;

    if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
        UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (synchronous) {
        if (params->semaphoreAddress != 0) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }
    }
    else {
        if (params->semaphoreAddress == 0) {
            if (params->semaphorePayload != 0) {
                status = NV_ERR_INVALID_ARGUMENT;
                goto done;
            }
        }
        else {
            sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
            if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
                !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
                status = NV_ERR_INVALID_ADDRESS;
                goto done;
            }
        }
    }

    if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
        if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
            dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
        else
            dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);

        if (!dest_gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
            status = NV_ERR_OUT_OF_RANGE;
            goto done;
        }
    }
    else {
        // If cpu_numa_node is not -1, we only check that it is a valid node in
        // the system, it has memory, and it doesn't correspond to a GPU node.
        //
        // For pageable memory, this is fine because alloc_pages_node will clamp
        // the allocation to cpuset_current_mems_allowed when
        // uvm_migrate_pageable is called from process context (uvm_migrate)
        // when dst_id is CPU. The UVM bottom half calls uvm_migrate_pageable
        // with a CPU dst_id only when the VMA memory policy is set to
        // dst_node_id and dst_node_id is not NUMA_NO_NODE.
        if (cpu_numa_node != -1 &&
            (!nv_numa_node_has_memory(cpu_numa_node) ||
             !node_isset(cpu_numa_node, node_possible_map) ||
             uvm_va_space_find_gpu_with_memory_node_id(va_space, cpu_numa_node))) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }
    }

    UVM_ASSERT(status == NV_OK);

    // If we're synchronous or if we need to release a semaphore, use a tracker.
    if (synchronous || params->semaphoreAddress)
        tracker_ptr = &tracker;

    if (params->length > 0) {
        uvm_api_range_type_t type;
        uvm_processor_id_t dest_id = dest_gpu ? dest_gpu->id : UVM_ID_CPU;

        type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
        if (type == UVM_API_RANGE_TYPE_INVALID) {
            status = NV_ERR_INVALID_ADDRESS;
            goto done;
        }

        if (type == UVM_API_RANGE_TYPE_ATS) {
            uvm_migrate_args_t uvm_migrate_args =
            {
                .va_space                       = va_space,
                .mm                             = mm,
                .start                          = params->base,
                .length                         = params->length,
                .dst_id                         = dest_id,
                .dst_node_id                    = cpu_numa_node,
                .populate_permissions           = UVM_POPULATE_PERMISSIONS_INHERIT,
                .touch                          = false,
                .skip_mapped                    = false,
                .populate_on_cpu_alloc_failures = false,
                .user_space_start               = &params->userSpaceStart,
                .user_space_length              = &params->userSpaceLength,
            };

            status = uvm_migrate_pageable(&uvm_migrate_args);
        }
        else {
            status = uvm_migrate(va_space,
                                 mm,
                                 params->base,
                                 params->length,
                                 dest_id,
                                 (UVM_ID_IS_CPU(dest_id) ? cpu_numa_node : NUMA_NO_NODE),
                                 params->flags,
                                 uvm_va_space_iter_first(va_space, params->base, params->base),
                                 tracker_ptr);
        }
    }

done:
    // We only need to hold mmap_lock to create new CPU mappings, so drop it if
    // we need to wait for the tracker to finish.
    //
    // TODO: Bug 1766650: For large migrations with destination CPU, try
    //       benchmarks to see if a two-pass approach would be faster (first
    //       pass pushes all GPU work asynchronously, second pass updates CPU
    //       mappings synchronously).
    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    if (tracker_ptr) {
        // If requested, release semaphore
        if (params->semaphoreAddress && (status == NV_OK)) {
            status = semaphore_release(params->semaphoreAddress,
                                       params->semaphorePayload,
                                       &sema_va_range->semaphore_pool,
                                       dest_gpu,
                                       tracker_ptr);
        }

        // Wait on the tracker if we are synchronous or there was an error. The
        // VA space lock must be held to prevent GPUs from being unregistered.
        if (synchronous || (status != NV_OK)) {
            NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);

            // Only clobber status if we didn't hit an earlier error
            if (status == NV_OK)
                status = tracker_status;

            flush_events = true;
        }

        uvm_tracker_deinit(tracker_ptr);
    }

    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);

    // If the migration is known to be complete, eagerly dispatch the migration
    // events, instead of processing them on a later event flush. Note that an
    // asynchronous migration could be complete by now, but the flush would not
    // be triggered.
    if (flush_events)
        uvm_tools_flush_events();

    return status;
}

NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    uvm_range_group_t *range_group;
    uvm_range_group_range_t *rgr;
    uvm_processor_id_t dest_id;
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NvU32 migrate_flags = 0;
    uvm_gpu_t *gpu = NULL;

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (uvm_uuid_is_cpu(&params->destinationUuid)) {
        dest_id = UVM_ID_CPU;
    }
    else {
        gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
        if (!gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        dest_id = gpu->id;
    }

    range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
    if (!range_group) {
        status = NV_ERR_OBJECT_NOT_FOUND;
        goto done;
    }

    // Migrate all VA ranges in the range group. uvm_migrate is used because it
    // performs all VA range validity checks.
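    //
    // Each range group range is migrated separately, and all the pushed work
    // is accumulated in local_tracker, which is waited on below since this API
    // is synchronous.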
    list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) {
        NvU64 start = rgr->node.start;
        NvU64 length = rgr->node.end - rgr->node.start + 1;

        if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
            status = NV_ERR_OUT_OF_RANGE;
        }
        else {
            uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);

            if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
                status = NV_ERR_INVALID_ADDRESS;
                goto done;
            }

            status = uvm_migrate(va_space,
                                 mm,
                                 start,
                                 length,
                                 dest_id,
                                 NUMA_NO_NODE,
                                 migrate_flags,
                                 first_va_range,
                                 &local_tracker);
        }

        if (status != NV_OK)
            goto done;
    }

done:
    // We only need to hold mmap_lock to create new CPU mappings, so drop it if
    // we need to wait for the tracker to finish.
    //
    // TODO: Bug 1766650: For large migrations with destination CPU, try
    //       benchmarks to see if a two-pass approach would be faster (first
    //       pass pushes all GPU work asynchronously, second pass updates CPU
    //       mappings synchronously).
    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    tracker_status = uvm_tracker_wait_deinit(&local_tracker);
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);

    // This API is synchronous, so wait for migrations to finish
    uvm_tools_flush_events();

    return status == NV_OK ? tracker_status : status;
}