1 /******************************************************************************* 2 Copyright (c) 2016-2023 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_common.h" 25 #include "uvm_ioctl.h" 26 #include "uvm_linux.h" 27 #include "uvm_global.h" 28 #include "uvm_gpu.h" 29 #include "uvm_lock.h" 30 #include "uvm_va_space.h" 31 #include "uvm_va_range.h" 32 #include "uvm_va_block.h" 33 #include "uvm_tracker.h" 34 #include "uvm_api.h" 35 #include "uvm_channel.h" 36 #include "uvm_processors.h" 37 #include "uvm_push.h" 38 #include "uvm_hal.h" 39 #include "uvm_tools.h" 40 #include "uvm_migrate.h" 41 #include "uvm_migrate_pageable.h" 42 #include "uvm_va_space_mm.h" 43 #include "nv_speculation_barrier.h" 44 45 typedef enum 46 { 47 UVM_MIGRATE_PASS_FIRST, 48 UVM_MIGRATE_PASS_SECOND 49 } uvm_migrate_pass_t; 50 51 static int uvm_perf_migrate_cpu_preunmap_enable = 1; 52 module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO); 53 54 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2 55 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX 10 56 static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT; 57 module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO); 58 59 // Global post-processed values of the module parameters 60 static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly; 61 static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly; 62 63 static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length) 64 { 65 NvU64 end = base + length - 1; 66 67 if (end > first_va_range->node.end) 68 return false; 69 70 return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end); 71 } 72 73 static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block, 74 uvm_va_block_retry_t *va_block_retry, 75 uvm_va_block_context_t *va_block_context, 76 uvm_va_block_region_t region, 77 uvm_processor_id_t dest_id) 78 { 79 uvm_prot_t prot; 80 uvm_page_index_t page_index; 81 NV_STATUS status = NV_OK; 82 const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id); 83 84 for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) 85 va_block_context->mask_by_prot[prot - 1].count = 0; 86 87 // Only map those pages that are not already mapped on destination 88 
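    // Each page considered below is bucketed by the highest protection it can
    // be granted without requiring a revocation (mask_by_prot); each non-empty
    // bucket is then mapped with a single batched uvm_va_block_map() call.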
    for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
        prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
        if (prot == UVM_PROT_NONE)
            continue;

        if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
            uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);

        uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
    }

    for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
        if (va_block_context->mask_by_prot[prot - 1].count == 0)
            continue;

        // We pass UvmEventMapRemoteCauseInvalid since the destination processor
        // of a migration will never be mapped remotely
        status = uvm_va_block_map(va_block,
                                  va_block_context,
                                  dest_id,
                                  region,
                                  &va_block_context->mask_by_prot[prot - 1].page_mask,
                                  prot,
                                  UvmEventMapRemoteCauseInvalid,
                                  &va_block->tracker);
        if (status != NV_OK)
            break;

        // Whoever added the other mapping(s) should have already added
        // SetAccessedBy processors
    }

    return status;
}

static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
                                                  uvm_va_block_retry_t *va_block_retry,
                                                  uvm_va_block_context_t *va_block_context,
                                                  uvm_va_block_region_t region,
                                                  uvm_processor_id_t dest_id)

{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;

    // Get the mask of unmapped pages because it will change after the
    // first map operation
    uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);

    if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
        // Do not map pages that are already resident on the CPU. This is in
        // order to avoid breaking system-wide atomic operations on HMM. HMM's
        // implementation of system-wide atomic operations involves restricting
        // mappings to one processor (CPU or a GPU) at a time. If we were to
        // grant a GPU a mapping to system memory, this gets into trouble
        // because, on the CPU side, Linux can silently upgrade PTE permissions
        // (move from read-only to read-write without any MMU notifiers firing),
        // thus breaking the model by allowing simultaneous read-write access
        // from two separate processors. To avoid that, just don't map such
        // pages at all when migrating.
        uvm_page_mask_andnot(&va_block_context->caller_page_mask,
                             &va_block_context->caller_page_mask,
                             uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
    }

    // Only map those pages that are not mapped anywhere else (likely due
    // to a first touch or a migration). We pass
    // UvmEventMapRemoteCauseInvalid since the destination processor of a
    // migration will never be mapped remotely.
    status = uvm_va_block_map(va_block,
                              va_block_context,
                              dest_id,
                              region,
                              &va_block_context->caller_page_mask,
                              UVM_PROT_READ_WRITE_ATOMIC,
                              UvmEventMapRemoteCauseInvalid,
                              &local_tracker);
    if (status != NV_OK)
        goto out;

    // Add mappings for AccessedBy processors
    //
    // No mappings within this call will operate on dest_id, so we don't
    // need to acquire the map operation above.
173 status = uvm_va_block_add_mappings_after_migration(va_block, 174 va_block_context, 175 dest_id, 176 dest_id, 177 region, 178 &va_block_context->caller_page_mask, 179 UVM_PROT_READ_WRITE_ATOMIC, 180 NULL); 181 182 out: 183 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker); 184 uvm_tracker_deinit(&local_tracker); 185 return status == NV_OK ? tracker_status : status; 186 } 187 188 // Pages that are not mapped anywhere can be safely mapped with RWA permission. 189 // The rest of pages need to individually compute the maximum permission that 190 // does not require a revocation. 191 static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block, 192 uvm_va_block_retry_t *va_block_retry, 193 uvm_va_block_context_t *va_block_context, 194 uvm_va_block_region_t region, 195 uvm_processor_id_t dest_id) 196 197 { 198 NV_STATUS status; 199 200 status = block_migrate_map_unmapped_pages(va_block, 201 va_block_retry, 202 va_block_context, 203 region, 204 dest_id); 205 if (status != NV_OK) 206 return status; 207 208 return block_migrate_map_mapped_pages(va_block, 209 va_block_retry, 210 va_block_context, 211 region, 212 dest_id); 213 } 214 215 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block, 216 uvm_va_block_retry_t *va_block_retry, 217 uvm_va_block_context_t *va_block_context, 218 uvm_va_block_region_t region, 219 uvm_processor_id_t dest_id, 220 uvm_migrate_mode_t mode, 221 uvm_tracker_t *out_tracker) 222 { 223 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 224 NV_STATUS status, tracker_status = NV_OK; 225 226 uvm_assert_mutex_locked(&va_block->lock); 227 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context->hmm.vma, region)); 228 229 if (uvm_va_block_is_hmm(va_block)) { 230 status = uvm_hmm_va_block_migrate_locked(va_block, 231 va_block_retry, 232 va_block_context, 233 dest_id, 234 region, 235 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE); 236 } 237 else { 238 uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range); 239 240 if (uvm_va_policy_is_read_duplicate(policy, va_space)) { 241 status = uvm_va_block_make_resident_read_duplicate(va_block, 242 va_block_retry, 243 va_block_context, 244 dest_id, 245 region, 246 NULL, 247 NULL, 248 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE); 249 } 250 else { 251 status = uvm_va_block_make_resident(va_block, 252 va_block_retry, 253 va_block_context, 254 dest_id, 255 region, 256 NULL, 257 NULL, 258 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE); 259 } 260 } 261 262 if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) { 263 // block_migrate_add_mappings will acquire the work from the above 264 // make_resident call and update the VA block tracker. 265 status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id); 266 } 267 268 if (out_tracker) 269 tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker); 270 271 return status == NV_OK ? tracker_status : status; 272 } 273 274 // Unmapping CPU pages on P9 systems is very costly, to the point that it 275 // becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for 276 // migrations that need to remove CPU mappings compared to migrations that only 277 // create CPU mappings. The overhead can be fully attributed to the TLB 278 // shootdown. When a CPU page is unmapped, it needs to (1) invalidate any copy 279 // in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK 280 // to remove the corresponding entries in the GPUs' TLBs. 
// ATSDs are not even required when migrating managed memory since UVM ensures
// that there are no ATS entries cached in the GPU TLBs for the managed VA
// ranges. However, we don't have a way to skip them as of today.
//
// In order to minimize the overhead of CPU unmaps during UvmMigrate we try to
// call unmap_mapping_range on VA regions larger than the VA block granularity
// before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued. This
// is because the NPU code uses MMU notifiers in order to get a callback
// (invalidate_range) when a TLB invalidation is required. Fortunately, this
// callback is not called if there is nothing to be invalidated. Therefore, if
// we issue a large unmap, subsequent unmaps within that region will not invoke
// the callback.
//
// However, due to (1), even issuing a single invalidate for the whole migrated
// range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
// This is only expected to get worse if CPU-GPU interconnects' BW keeps
// increasing.
//
// Thus, VA range migrations are split into groups of contiguous VA blocks, and
// we trigger a single pre-unmap of each group in the Linux kernel before the
// VA blocks' migration starts. For example, with the default
// uvm_perf_migrate_cpu_preunmap_block_order of 2, each group covers four
// contiguous VA blocks. This way, we trigger larger (more efficient) TLB
// invalidations than when we do it one VA block at a time, while still being
// able to pipeline the migration, which allows hiding most of the costs of (1).
//
// However, there are some cases in which the CPU has mappings to the pages
// being migrated but they don't need to be removed (removing them would only
// introduce unnecessary CPU faults later on). Therefore, we skip the pre-unmap
// step under the following conditions:
// - Pages mapped by the CPU that are *already* in the destination.
// - Pages mapped by the CPU that are *not* in the destination but
//   read-duplication is enabled in the VA range.

// This function checks if the pre-unmap optimization is required given the
// system capabilities and the destination of the migration. This is to skip
// any subsequent checks required by the optimization, which can be costly.
//
// The current logic checks that:
// - We are in the first pass of the migration (see the explanation of the
//   two-pass strategy in uvm_migrate).
// - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't need
//   this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks. Otherwise, we skip the preunmap to
//   avoid the overhead.
static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
                                             uvm_migrate_pass_t pass,
                                             bool is_single_block)

{
    if (!g_uvm_perf_migrate_cpu_preunmap_enable)
        return false;

    if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
        return false;

    if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
        return false;

    return true;
}

// This function determines if the VA range properties avoid the need to remove
// CPU mappings on UvmMigrate. Currently, it only checks whether
// read-duplication is enabled in the VA range.
// This is because, when migrating read-duplicated VA blocks, the source
// processor doesn't need to be unmapped (though it may need write access
// revoked).
static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
                                            uvm_va_space_t *va_space)
{
    return !uvm_va_policy_is_read_duplicate(policy, va_space);
}

// Function that determines if the VA block to be migrated contains pages with
// CPU mappings that don't need to be removed (see the comment above). In that
// case, false is returned. Otherwise, it returns true and stores the number of
// pages whose CPU mappings do need to be removed in the variable pointed to by
// num_unmap_pages.
static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
                                            uvm_va_block_context_t *va_block_context,
                                            NvU64 start,
                                            NvU64 end,
                                            uvm_processor_id_t dest_id,
                                            NvU32 *num_unmap_pages)
{
    const uvm_page_mask_t *mapped_pages_cpu;
    NvU32 num_cpu_unchanged_pages = 0;
    uvm_va_block_region_t region;

    *num_unmap_pages = 0;

    if (!va_block)
        return true;

    region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));

    uvm_mutex_lock(&va_block->lock);

    mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
    if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
        const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
        uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;

        // TODO: Bug 1877578
        //
        // We assume that if pages are mapped on the CPU and not resident on
        // the destination, the pages will change residency so the CPU must be
        // unmapped. If we implement automatic read-duplication heuristics in
        // the future, we'll also need to check if the pages are being
        // read-duplicated.
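        // Pages that are both mapped on the CPU and already resident on the
        // destination keep their CPU mappings; they are counted here so they
        // can be subtracted from the unmap count computed below.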
391 uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest); 392 393 num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region); 394 } 395 396 *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages; 397 398 uvm_mutex_unlock(&va_block->lock); 399 400 return num_cpu_unchanged_pages == 0; 401 } 402 403 static void preunmap_multi_block(uvm_va_range_t *va_range, 404 uvm_va_block_context_t *va_block_context, 405 NvU64 start, 406 NvU64 end, 407 uvm_processor_id_t dest_id) 408 { 409 size_t i; 410 const size_t first_block_index = uvm_va_range_block_index(va_range, start); 411 const size_t last_block_index = uvm_va_range_block_index(va_range, end); 412 NvU32 num_unmap_pages = 0; 413 414 UVM_ASSERT(start >= va_range->node.start); 415 UVM_ASSERT(end <= va_range->node.end); 416 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 417 uvm_assert_rwsem_locked(&va_range->va_space->lock); 418 419 UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end)); 420 421 for (i = first_block_index; i <= last_block_index; i++) { 422 NvU32 num_block_unmap_pages; 423 424 if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i), 425 va_block_context, 426 start, 427 end, 428 dest_id, 429 &num_block_unmap_pages)) { 430 return; 431 } 432 433 num_unmap_pages += num_block_unmap_pages; 434 } 435 436 if (num_unmap_pages > 0) 437 unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1); 438 } 439 440 static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range, 441 uvm_va_block_context_t *va_block_context, 442 NvU64 start, 443 NvU64 end, 444 uvm_processor_id_t dest_id, 445 uvm_migrate_mode_t mode, 446 uvm_tracker_t *out_tracker) 447 { 448 size_t i; 449 const size_t first_block_index = uvm_va_range_block_index(va_range, start); 450 const size_t last_block_index = uvm_va_range_block_index(va_range, end); 451 452 UVM_ASSERT(start >= va_range->node.start); 453 UVM_ASSERT(end <= va_range->node.end); 454 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED); 455 uvm_assert_rwsem_locked(&va_range->va_space->lock); 456 457 UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end)); 458 459 // Iterate over blocks, populating them if necessary 460 for (i = first_block_index; i <= last_block_index; i++) { 461 uvm_va_block_retry_t va_block_retry; 462 uvm_va_block_region_t region; 463 uvm_va_block_t *va_block; 464 NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block); 465 466 if (status != NV_OK) 467 return status; 468 469 region = uvm_va_block_region_from_start_end(va_block, 470 max(start, va_block->start), 471 min(end, va_block->end)); 472 473 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry, 474 uvm_va_block_migrate_locked(va_block, 475 &va_block_retry, 476 va_block_context, 477 region, 478 dest_id, 479 mode, 480 out_tracker)); 481 if (status != NV_OK) 482 return status; 483 } 484 485 return NV_OK; 486 } 487 488 static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range, 489 uvm_va_block_context_t *va_block_context, 490 NvU64 start, 491 NvU64 end, 492 uvm_processor_id_t dest_id, 493 uvm_migrate_mode_t mode, 494 bool should_do_cpu_preunmap, 495 uvm_tracker_t *out_tracker) 496 { 497 NvU64 preunmap_range_start = start; 498 uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range); 499 500 should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(policy, va_range->va_space); 501 502 // Divide migrations into groups 
of contiguous VA blocks. This is to trigger 503 // CPU unmaps for that region before the migration starts. 504 while (preunmap_range_start < end) { 505 NV_STATUS status; 506 NvU64 preunmap_range_end; 507 508 if (should_do_cpu_preunmap) { 509 preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size); 510 preunmap_range_end = min(preunmap_range_end - 1, end); 511 512 preunmap_multi_block(va_range, 513 va_block_context, 514 preunmap_range_start, 515 preunmap_range_end, 516 dest_id); 517 } 518 else { 519 preunmap_range_end = end; 520 } 521 522 status = uvm_va_range_migrate_multi_block(va_range, 523 va_block_context, 524 preunmap_range_start, 525 preunmap_range_end, 526 dest_id, 527 mode, 528 out_tracker); 529 if (status != NV_OK) 530 return status; 531 532 preunmap_range_start = preunmap_range_end + 1; 533 } 534 535 return NV_OK; 536 } 537 538 static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space, 539 uvm_va_block_context_t *va_block_context, 540 uvm_va_range_t *first_va_range, 541 NvU64 base, 542 NvU64 length, 543 uvm_processor_id_t dest_id, 544 uvm_migrate_mode_t mode, 545 bool should_do_cpu_preunmap, 546 uvm_tracker_t *out_tracker) 547 { 548 uvm_va_range_t *va_range, *va_range_last; 549 NvU64 end = base + length - 1; 550 NV_STATUS status = NV_OK; 551 bool skipped_migrate = false; 552 553 if (!first_va_range) { 554 // For HMM, we iterate over va_blocks since there is no va_range. 555 return uvm_hmm_migrate_ranges(va_space, 556 va_block_context, 557 base, 558 length, 559 dest_id, 560 mode, 561 out_tracker); 562 } 563 564 UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base)); 565 566 va_range_last = NULL; 567 uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) { 568 uvm_range_group_range_iter_t iter; 569 uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range); 570 571 va_range_last = va_range; 572 573 // Only managed ranges can be migrated 574 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) { 575 status = NV_ERR_INVALID_ADDRESS; 576 break; 577 } 578 579 // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range 580 // into many range groups. For this reason, we iterate over each va_range first 581 // then through the range groups within. 582 uvm_range_group_for_each_migratability_in(&iter, 583 va_space, 584 max(base, va_range->node.start), 585 min(end, va_range->node.end)) { 586 // Skip non-migratable VA ranges 587 if (!iter.migratable) { 588 // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't 589 // already resident at dest_id. 
590 if (!uvm_va_policy_preferred_location_equal(policy, dest_id, va_block_context->make_resident.dest_nid)) 591 skipped_migrate = true; 592 } 593 else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) && 594 !uvm_id_equal(dest_id, policy->preferred_location)) { 595 // Don't migrate to a non-faultable GPU that is in UVM-Lite mode, 596 // unless it's the preferred location 597 status = NV_ERR_INVALID_DEVICE; 598 break; 599 } 600 else { 601 status = uvm_va_range_migrate(va_range, 602 va_block_context, 603 iter.start, 604 iter.end, 605 dest_id, 606 mode, 607 should_do_cpu_preunmap, 608 out_tracker); 609 if (status != NV_OK) 610 break; 611 } 612 } 613 } 614 615 if (status != NV_OK) 616 return status; 617 618 // Check that we were able to iterate over the entire range without any gaps 619 if (!va_range_last || va_range_last->node.end < end) 620 return NV_ERR_INVALID_ADDRESS; 621 622 if (skipped_migrate) 623 return NV_WARN_MORE_PROCESSING_REQUIRED; 624 625 return NV_OK; 626 } 627 628 static NV_STATUS uvm_migrate(uvm_va_space_t *va_space, 629 struct mm_struct *mm, 630 NvU64 base, 631 NvU64 length, 632 uvm_processor_id_t dest_id, 633 int dest_nid, 634 NvU32 migrate_flags, 635 uvm_va_range_t *first_va_range, 636 uvm_tracker_t *out_tracker) 637 { 638 NV_STATUS status = NV_OK; 639 uvm_va_block_context_t *va_block_context; 640 bool do_mappings; 641 bool do_two_passes; 642 bool is_single_block; 643 bool should_do_cpu_preunmap; 644 645 uvm_assert_rwsem_locked(&va_space->lock); 646 647 // If the GPU has its memory disabled, just skip the migration and let 648 // faults take care of things. 649 if (!uvm_va_space_processor_has_memory(va_space, dest_id)) 650 return NV_OK; 651 652 if (mm) 653 uvm_assert_mmap_lock_locked(mm); 654 else if (!first_va_range) 655 return NV_ERR_INVALID_ADDRESS; 656 657 va_block_context = uvm_va_block_context_alloc(mm); 658 if (!va_block_context) 659 return NV_ERR_NO_MEMORY; 660 661 va_block_context->make_resident.dest_nid = dest_nid; 662 663 // We perform two passes (unless the migration only covers a single VA 664 // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the 665 // following scenarios: 666 // 667 // - Migrations that add CPU mappings, since they are synchronous operations 668 // that delay the migration of the next VA blocks. 669 // - Concurrent migrations. This is due to our current channel selection 670 // logic that doesn't prevent false dependencies between independent 671 // operations. For example, removal of mappings for outgoing transfers are 672 // delayed by the mappings added by incoming transfers. 673 // TODO: Bug 1764953: Re-evaluate the two-pass logic when channel selection 674 // is overhauled. 675 // 676 // The two passes are as follows: 677 // 678 // 1- Transfer all VA blocks (do not add mappings) 679 // 2- Go block by block reexecuting the transfer (in case someone moved it 680 // since the first pass), and adding the mappings. 681 // 682 // For HMM (!first_va_range), we always do a single pass. 
683 is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length); 684 do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP); 685 do_two_passes = do_mappings && !is_single_block; 686 687 if (do_two_passes) { 688 should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block); 689 690 status = uvm_migrate_ranges(va_space, 691 va_block_context, 692 first_va_range, 693 base, 694 length, 695 dest_id, 696 UVM_MIGRATE_MODE_MAKE_RESIDENT, 697 should_do_cpu_preunmap, 698 out_tracker); 699 } 700 701 if (status == NV_OK) { 702 uvm_migrate_mode_t mode = do_mappings? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP: 703 UVM_MIGRATE_MODE_MAKE_RESIDENT; 704 uvm_migrate_pass_t pass = do_two_passes? UVM_MIGRATE_PASS_SECOND: 705 UVM_MIGRATE_PASS_FIRST; 706 should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block); 707 708 status = uvm_migrate_ranges(va_space, 709 va_block_context, 710 first_va_range, 711 base, 712 length, 713 dest_id, 714 mode, 715 should_do_cpu_preunmap, 716 out_tracker); 717 } 718 719 uvm_va_block_context_free(va_block_context); 720 721 return status; 722 } 723 724 static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu, 725 uvm_va_range_semaphore_pool_t *semaphore_va_range, 726 NvU64 semaphore_user_addr, 727 NvU32 semaphore_payload, 728 uvm_tracker_t *release_after_tracker) 729 { 730 NV_STATUS status; 731 uvm_push_t push; 732 uvm_channel_type_t channel_type; 733 NvU64 semaphore_gpu_va; 734 NvU64 semaphore_offset; 735 736 UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu)); 737 738 semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr; 739 semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset; 740 741 // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a 742 // performance standpoint because if the migration is targeting a GPU, it is 743 // likely that the channel used for the GPU page table update (pushed to 744 // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. The 745 // inter-channel dependency avoided by using a single channel can add a 746 // significant overhead to the enclosing migration. 747 // 748 // In SR-IOV heavy, the user semaphore release is functionally forbidden 749 // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is not 750 // a page tree operation. 
751 if (uvm_parent_gpu_is_virt_mode_sriov_heavy(gpu->parent)) 752 channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL; 753 else 754 channel_type = UVM_CHANNEL_TYPE_MEMOPS; 755 756 status = uvm_push_begin_acquire(gpu->channel_manager, 757 channel_type, 758 release_after_tracker, 759 &push, 760 "Pushing semaphore release (*0x%llx = %u)", 761 semaphore_user_addr, 762 semaphore_payload); 763 if (status != NV_OK) 764 return status; 765 766 gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload); 767 uvm_push_end(&push); 768 769 uvm_mutex_lock(&semaphore_va_range->tracker_lock); 770 status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push); 771 uvm_tracker_remove_completed(&semaphore_va_range->tracker); 772 uvm_mutex_unlock(&semaphore_va_range->tracker_lock); 773 774 return status; 775 } 776 777 static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload) 778 { 779 char *semaphore_cpu_va; 780 NvU64 semaphore_offset; 781 782 UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem)); 783 784 semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr; 785 786 // Prevent processor speculation prior to accessing user-mapped memory to 787 // avoid leaking information from side-channel attacks. Under speculation, a 788 // valid VA range which does not contain this semaphore could be used by the 789 // caller. It's unclear but likely that the user might be able to control 790 // the data at that address. Auditing all potential ways that could happen 791 // is difficult and error-prone, so to be on the safe side we'll just always 792 // block speculation. 793 nv_speculation_barrier(); 794 795 semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset; 796 797 UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload); 798 } 799 800 static NV_STATUS semaphore_release(NvU64 semaphore_address, 801 NvU32 semaphore_payload, 802 uvm_va_range_semaphore_pool_t *semaphore_pool, 803 uvm_gpu_t *dest_gpu, 804 uvm_tracker_t *tracker_ptr) 805 { 806 uvm_gpu_t *gpu; 807 uvm_gpu_t *gpu_owner = semaphore_pool->owner; 808 809 // If there is a GPU owner, release the semaphore from it. 810 if (gpu_owner != NULL) 811 return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr); 812 813 // Attempt eager release from CPU if the tracker is already completed. 814 if (uvm_tracker_is_completed(tracker_ptr)) { 815 semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload); 816 return NV_OK; 817 } 818 819 if (dest_gpu == NULL) { 820 // The destination is the CPU, but we didn't do a CPU release above 821 // because the previous work is not complete. This situation arises when 822 // accessed_by mappings are being set up asynchronously, or the 823 // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should 824 // be a registered GPU, since all CPU work is synchronous, and the 825 // semaphore must be mapped on that GPU. 826 // 827 // Note that the GPU selected for the release may not be the same device 828 // that prevented the tracker from being complete. 
829 gpu = uvm_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on); 830 831 UVM_ASSERT(gpu != NULL); 832 } 833 else { 834 gpu = dest_gpu; 835 } 836 837 return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr); 838 } 839 840 NV_STATUS uvm_migrate_init(void) 841 { 842 NV_STATUS status = uvm_migrate_pageable_init(); 843 if (status != NV_OK) 844 return status; 845 846 g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0; 847 848 BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1)); 849 850 if (g_uvm_perf_migrate_cpu_preunmap_enable) { 851 if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) { 852 g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order; 853 } 854 else { 855 g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT; 856 857 pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n", 858 uvm_perf_migrate_cpu_preunmap_block_order, 859 UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT); 860 } 861 } 862 863 return NV_OK; 864 } 865 866 void uvm_migrate_exit(void) 867 { 868 uvm_migrate_pageable_exit(); 869 } 870 871 NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp) 872 { 873 uvm_va_space_t *va_space = uvm_va_space_get(filp); 874 uvm_tracker_t tracker = UVM_TRACKER_INIT(); 875 uvm_tracker_t *tracker_ptr = NULL; 876 uvm_gpu_t *dest_gpu = NULL; 877 uvm_va_range_t *sema_va_range = NULL; 878 struct mm_struct *mm; 879 NV_STATUS status = NV_OK; 880 bool flush_events = false; 881 const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC); 882 int cpu_numa_node = (int)params->cpuNumaNode; 883 884 // We temporarily allow 0 length in the IOCTL parameters as a signal to 885 // only release the semaphore. This is because user-space is in charge of 886 // migrating pageable memory in some cases. 887 // 888 // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch 889 // to migrate_vma for all types of vmas. 890 if (params->length > 0 || synchronous || params->semaphoreAddress == 0) { 891 if (uvm_api_range_invalid(params->base, params->length)) 892 return NV_ERR_INVALID_ADDRESS; 893 } 894 895 if (params->flags & ~UVM_MIGRATE_FLAGS_ALL) 896 return NV_ERR_INVALID_ARGUMENT; 897 898 if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) { 899 UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. 
Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (synchronous) {
        if (params->semaphoreAddress != 0) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }
    }
    else {
        if (params->semaphoreAddress == 0) {
            if (params->semaphorePayload != 0) {
                status = NV_ERR_INVALID_ARGUMENT;
                goto done;
            }
        }
        else {
            sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
            if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
                !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
                status = NV_ERR_INVALID_ADDRESS;
                goto done;
            }
        }
    }

    if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
        if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
            dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
        else
            dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);

        if (!dest_gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
            status = NV_ERR_OUT_OF_RANGE;
            goto done;
        }
    }
    else {
        // If cpu_numa_node is not -1, we only check that it is a valid node in
        // the system, it has memory, and it doesn't correspond to a GPU node.
        //
        // For pageable memory, this is fine because alloc_pages_node will clamp
        // the allocation to cpuset_current_mems_allowed when
        // uvm_migrate_pageable is called from process context (uvm_migrate)
        // when dst_id is CPU. UVM bottom half calls uvm_migrate_pageable with
        // CPU dst_id only when the VMA memory policy is set to dst_node_id and
        // dst_node_id is not NUMA_NO_NODE.
        if (cpu_numa_node != -1 &&
            (!nv_numa_node_has_memory(cpu_numa_node) ||
             !node_isset(cpu_numa_node, node_possible_map) ||
             uvm_va_space_find_gpu_with_memory_node_id(va_space, cpu_numa_node))) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }
    }

    UVM_ASSERT(status == NV_OK);

    // If we're synchronous or if we need to release a semaphore, use a tracker.
    if (synchronous || params->semaphoreAddress)
        tracker_ptr = &tracker;

    if (params->length > 0) {
        uvm_api_range_type_t type;
        uvm_processor_id_t dest_id = dest_gpu ? dest_gpu->id : UVM_ID_CPU;

        type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
        if (type == UVM_API_RANGE_TYPE_INVALID) {
            status = NV_ERR_INVALID_ADDRESS;
            goto done;
        }

        if (type == UVM_API_RANGE_TYPE_ATS) {
            uvm_migrate_args_t uvm_migrate_args =
            {
                .va_space                       = va_space,
                .mm                             = mm,
                .start                          = params->base,
                .length                         = params->length,
                .dst_id                         = dest_id,
                .dst_node_id                    = cpu_numa_node,
                .populate_permissions           = UVM_POPULATE_PERMISSIONS_INHERIT,
                .touch                          = false,
                .skip_mapped                    = false,
                .populate_on_cpu_alloc_failures = false,
                .user_space_start               = &params->userSpaceStart,
                .user_space_length              = &params->userSpaceLength,
            };

            status = uvm_migrate_pageable(&uvm_migrate_args);
        }
        else {
            status = uvm_migrate(va_space,
                                 mm,
                                 params->base,
                                 params->length,
                                 dest_id,
                                 (UVM_ID_IS_CPU(dest_id) ? cpu_numa_node : NUMA_NO_NODE),
                                 params->flags,
                                 uvm_va_space_iter_first(va_space, params->base, params->base),
                                 tracker_ptr);
        }
    }

done:
    // We only need to hold mmap_lock to create new CPU mappings, so drop it if
    // we need to wait for the tracker to finish.
    //
    // TODO: Bug 1766650: For large migrations with destination CPU, try
    //       benchmarks to see if a two-pass approach would be faster (first
    //       pass pushes all GPU work asynchronously, second pass updates CPU
    //       mappings synchronously).
    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    if (tracker_ptr) {
        // If requested, release semaphore
        if (params->semaphoreAddress && (status == NV_OK)) {
            status = semaphore_release(params->semaphoreAddress,
                                       params->semaphorePayload,
                                       &sema_va_range->semaphore_pool,
                                       dest_gpu,
                                       tracker_ptr);
        }

        // Wait on the tracker if we are synchronous or there was an error. The
        // VA space lock must be held to prevent GPUs from being unregistered.
        if (synchronous || (status != NV_OK)) {
            NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);

            // Only clobber status if we didn't hit an earlier error
            if (status == NV_OK)
                status = tracker_status;

            flush_events = true;
        }

        uvm_tracker_deinit(tracker_ptr);
    }

    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);

    // If the migration is known to be complete, eagerly dispatch the migration
    // events, instead of processing them on a later event flush. Note that an
    // asynchronous migration could be complete by now, but the flush would not
    // be triggered.
    if (flush_events)
        uvm_tools_flush_events();

    return status;
}

NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    uvm_range_group_t *range_group;
    uvm_range_group_range_t *rgr;
    uvm_processor_id_t dest_id;
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NvU32 migrate_flags = 0;
    uvm_gpu_t *gpu = NULL;

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (uvm_uuid_is_cpu(&params->destinationUuid)) {
        dest_id = UVM_ID_CPU;
    }
    else {
        gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
        if (!gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        dest_id = gpu->id;
    }

    range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
    if (!range_group) {
        status = NV_ERR_OBJECT_NOT_FOUND;
        goto done;
    }

    // Migrate all VA ranges in the range group. uvm_migrate is used because it
    // performs all VA range validity checks.
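    // Each range group range is migrated independently; the first failure stops
    // the loop, and the local tracker is waited on below before returning so
    // the API remains synchronous.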
1101 list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) { 1102 NvU64 start = rgr->node.start; 1103 NvU64 length = rgr->node.end - rgr->node.start + 1; 1104 1105 if (gpu && !uvm_gpu_can_address(gpu, start, length)) { 1106 status = NV_ERR_OUT_OF_RANGE; 1107 } 1108 else { 1109 uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start); 1110 1111 if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) { 1112 status = NV_ERR_INVALID_ADDRESS; 1113 goto done; 1114 } 1115 1116 status = uvm_migrate(va_space, 1117 mm, 1118 start, 1119 length, 1120 dest_id, 1121 NUMA_NO_NODE, 1122 migrate_flags, 1123 first_va_range, 1124 &local_tracker); 1125 } 1126 1127 if (status != NV_OK) 1128 goto done; 1129 } 1130 1131 done: 1132 // We only need to hold mmap_lock to create new CPU mappings, so drop it if 1133 // we need to wait for the tracker to finish. 1134 // 1135 // TODO: Bug 1766650: For large migrations with destination CPU, try 1136 // benchmarks to see if a two-pass approach would be faster (first 1137 // pass pushes all GPU work asynchronously, second pass updates CPU 1138 // mappings synchronously). 1139 if (mm) 1140 uvm_up_read_mmap_lock_out_of_order(mm); 1141 1142 tracker_status = uvm_tracker_wait_deinit(&local_tracker); 1143 uvm_va_space_up_read(va_space); 1144 uvm_va_space_mm_or_current_release(va_space, mm); 1145 1146 // This API is synchronous, so wait for migrations to finish 1147 uvm_tools_flush_events(); 1148 1149 return status == NV_OK? tracker_status : status; 1150 } 1151
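// Illustrative user-space sketch (not driver code): one way a client might
// drive the asynchronous UVM_MIGRATE path handled by uvm_api_migrate() above,
// using the semaphore-based completion notification. Field and flag names
// follow UVM_MIGRATE_PARAMS as used in this file; the file descriptor, the
// semaphore pool setup and all error handling are assumptions and are omitted.
//
//     UVM_MIGRATE_PARAMS p = {0};
//     volatile NvU32 *sema = sema_pool_va;          // inside a SEMAPHORE_POOL VA range
//
//     p.base             = (NvU64)region_start;     // must be a valid managed/ATS range
//     p.length           = region_length;
//     p.destinationUuid  = gpu_uuid;                // or the CPU UUID plus cpuNumaNode
//     p.flags            = UVM_MIGRATE_FLAG_ASYNC;
//     p.semaphoreAddress = (NvU64)(uintptr_t)sema;  // aligned to sizeof(semaphorePayload)
//     p.semaphorePayload = 0xCAFE;
//
//     ioctl(uvm_fd, UVM_MIGRATE, &p);               // returns once the work is pushed
//
//     while (*sema != 0xCAFE)                       // payload is written (from the CPU or
//         sched_yield();                            // a GPU) when the migration and any
//                                                   // mapping work has completed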