/*******************************************************************************
    Copyright (c) 2016-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_ioctl.h"
#include "uvm_linux.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_channel.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_tools.h"
#include "uvm_migrate.h"
#include "uvm_migrate_pageable.h"
#include "uvm_va_space_mm.h"
#include "nv_speculation_barrier.h"

typedef enum
{
    UVM_MIGRATE_PASS_FIRST,
    UVM_MIGRATE_PASS_SECOND
} uvm_migrate_pass_t;

static int uvm_perf_migrate_cpu_preunmap_enable = 1;
module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO);

#define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2
#define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX     10
static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);

// Global post-processed values of the module parameters
static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;

static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
{
    NvU64 end = base + length - 1;

    if (end > first_va_range->node.end)
        return false;

    return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
}

static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
                                                uvm_va_block_retry_t *va_block_retry,
                                                uvm_va_block_context_t *va_block_context,
                                                uvm_va_block_region_t region,
                                                uvm_processor_id_t dest_id)
{
    uvm_prot_t prot;
    uvm_page_index_t page_index;
    NV_STATUS status = NV_OK;
    const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id);

    for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot)
        va_block_context->mask_by_prot[prot - 1].count = 0;

    // Only map those pages that are not already mapped on destination
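    // Group them by the highest protection they can be mapped with, so that
    // each protection level can then be mapped with a single call below.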
    for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
        prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
        if (prot == UVM_PROT_NONE)
            continue;

        if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
            uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);

        uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
    }

    for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
        if (va_block_context->mask_by_prot[prot - 1].count == 0)
            continue;

        // We pass UvmEventMapRemoteCauseInvalid since the destination processor
        // of a migration will never be mapped remotely
        status = uvm_va_block_map(va_block,
                                  va_block_context,
                                  dest_id,
                                  region,
                                  &va_block_context->mask_by_prot[prot - 1].page_mask,
                                  prot,
                                  UvmEventMapRemoteCauseInvalid,
                                  &va_block->tracker);
        if (status != NV_OK)
            break;

        // Whoever added the other mapping(s) should have already added
        // SetAccessedBy processors
    }

    return status;
}

static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
                                                  uvm_va_block_retry_t *va_block_retry,
                                                  uvm_va_block_context_t *va_block_context,
                                                  uvm_va_block_region_t region,
                                                  uvm_processor_id_t dest_id)
{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status;

    // Save the mask of unmapped pages because it will change after the
    // first map operation
    uvm_page_mask_complement(&va_block_context->caller_page_mask, &va_block->maybe_mapped_pages);

    // Only map those pages that are not mapped anywhere else (likely due
    // to a first touch or a migration). We pass
    // UvmEventMapRemoteCauseInvalid since the destination processor of a
    // migration will never be mapped remotely.
    status = uvm_va_block_map(va_block,
                              va_block_context,
                              dest_id,
                              region,
                              &va_block_context->caller_page_mask,
                              UVM_PROT_READ_WRITE_ATOMIC,
                              UvmEventMapRemoteCauseInvalid,
                              &local_tracker);
    if (status != NV_OK)
        goto out;

    // Add mappings for AccessedBy processors
    //
    // No mappings within this call will operate on dest_id, so we don't
    // need to acquire the map operation above.
    status = uvm_va_block_add_mappings_after_migration(va_block,
                                                       va_block_context,
                                                       dest_id,
                                                       dest_id,
                                                       region,
                                                       &va_block_context->caller_page_mask,
                                                       UVM_PROT_READ_WRITE_ATOMIC,
                                                       NULL);

out:
    tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
    uvm_tracker_deinit(&local_tracker);
    return status == NV_OK ? tracker_status : status;
}

// Pages that are not mapped anywhere can be safely mapped with RWA permission.
// For the remaining pages, we individually compute the maximum permission that
// does not require a revocation.
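//
// Both helpers below use masks embedded in va_block_context (caller_page_mask
// and mask_by_prot) as scratch state while building the map operations.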
static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block,
                                            uvm_va_block_retry_t *va_block_retry,
                                            uvm_va_block_context_t *va_block_context,
                                            uvm_va_block_region_t region,
                                            uvm_processor_id_t dest_id)
{
    NV_STATUS status;

    status = block_migrate_map_unmapped_pages(va_block,
                                              va_block_retry,
                                              va_block_context,
                                              region,
                                              dest_id);
    if (status != NV_OK)
        return status;

    return block_migrate_map_mapped_pages(va_block,
                                          va_block_retry,
                                          va_block_context,
                                          region,
                                          dest_id);
}

NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
                                      uvm_va_block_retry_t *va_block_retry,
                                      uvm_va_block_context_t *va_block_context,
                                      uvm_va_block_region_t region,
                                      uvm_processor_id_t dest_id,
                                      uvm_migrate_mode_t mode,
                                      uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
    NV_STATUS status, tracker_status = NV_OK;

    uvm_assert_mutex_locked(&va_block->lock);
    UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context, region));

    if (uvm_va_block_is_hmm(va_block)) {
        status = uvm_hmm_va_block_migrate_locked(va_block,
                                                 va_block_retry,
                                                 va_block_context,
                                                 dest_id,
                                                 region,
                                                 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
    }
    else {
        va_block_context->policy = uvm_va_range_get_policy(va_block->va_range);

        if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space)) {
            status = uvm_va_block_make_resident_read_duplicate(va_block,
                                                               va_block_retry,
                                                               va_block_context,
                                                               dest_id,
                                                               region,
                                                               NULL,
                                                               NULL,
                                                               UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
        else {
            status = uvm_va_block_make_resident(va_block,
                                                va_block_retry,
                                                va_block_context,
                                                dest_id,
                                                region,
                                                NULL,
                                                NULL,
                                                UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
    }

    if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
        // block_migrate_add_mappings will acquire the work from the above
        // make_resident call and update the VA block tracker.
        status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id);
    }

    if (out_tracker)
        tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);

    return status == NV_OK ? tracker_status : status;
}

// Unmapping CPU pages on P9 systems is very costly, to the point that it
// becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for
// migrations that need to remove CPU mappings compared to migrations that only
// create CPU mappings. The overhead can be fully attributed to the TLB
// shootdown. When a CPU page is unmapped, it needs to (1) invalidate any copy
// in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK
// to remove the corresponding entries in the GPUs' TLBs. ATSDs are not even
// required when migrating managed memory since UVM ensures that there are no
// ATS entries cached in the GPU TLBs for the managed VA ranges. However, we
// don't have a way to skip them as of today.
//
// In order to minimize the overhead of CPU unmaps during UvmMigrate, we try to
// call unmap_mapping_range on VA regions larger than the VA block granularity
// before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued.
// This is because the NPU code uses MMU notifiers in order to get a callback
// (invalidate_range) when a TLB invalidation is required. Fortunately, this
// callback is not called if there is nothing to be invalidated. Therefore, if
// we issue a large unmap, subsequent unmaps within that region will not invoke
// the callback.
//
// However, due to (1), even issuing a single invalidate for the whole migrated
// range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
// This is only expected to get worse if CPU-GPU interconnects' BW keeps
// increasing.
//
// Thus, VA range migrations are split into groups of contiguous VA blocks, and
// we trigger a single pre-unmap of each group of VA blocks in the Linux kernel
// before the migration of those VA blocks starts. This way, we trigger larger
// (more efficient) TLB invalidations than when we do it one VA block at a
// time, while still being able to pipeline the migration, which allows hiding
// most of the costs of (1).
//
// However, there are some cases in which the CPU has mappings to the pages
// being migrated but they don't need to be removed (removing them would only
// introduce unnecessary CPU faults later on). Therefore, we skip the pre-unmap
// step under the following conditions:
// - Pages mapped by the CPU that are *already* in the destination.
// - Pages mapped by the CPU that are *not* in the destination but
//   read-duplication is enabled in the VA range.

// This function checks if the pre-unmap optimization is required given the
// system capabilities and the destination of the migration. This is to skip
// any subsequent checks required by the optimization, which can be costly.
//
// The current logic checks that:
// - We are in the first pass of the migration (see the explanation of the
//   two-pass strategy in uvm_migrate).
// - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't
//   need this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks. Otherwise, we skip the preunmap
//   to avoid the overhead.
static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
                                             uvm_migrate_pass_t pass,
                                             bool is_single_block)
{
    if (!g_uvm_perf_migrate_cpu_preunmap_enable)
        return false;

    if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
        return false;

    if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
        return false;

    return true;
}

// This function determines if the VA range properties avoid the need to remove
// CPU mappings on UvmMigrate. Currently, it only checks whether
// read-duplication is enabled in the VA range. This is because, when migrating
// read-duplicated VA blocks, the source processor doesn't need to be unmapped
// (though it may need write access revoked).
static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
                                            uvm_va_space_t *va_space)
{
    return !uvm_va_policy_is_read_duplicate(policy, va_space);
}

// This function determines if the VA block to be migrated contains pages with
// CPU mappings that don't need to be removed (see the comment above). In that
// case false is returned. Otherwise it returns true, and stores in the
// variable pointed to by num_unmap_pages the number of pages that do need
// their CPU mappings removed.
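//
// A NULL va_block (a block that has not been populated yet) trivially returns
// true with *num_unmap_pages set to 0, since there are no CPU mappings to
// remove.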
static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
                                            uvm_va_block_context_t *va_block_context,
                                            NvU64 start,
                                            NvU64 end,
                                            uvm_processor_id_t dest_id,
                                            NvU32 *num_unmap_pages)
{
    const uvm_page_mask_t *mapped_pages_cpu;
    NvU32 num_cpu_unchanged_pages = 0;
    uvm_va_block_region_t region;

    *num_unmap_pages = 0;

    if (!va_block)
        return true;

    UVM_ASSERT(va_range_should_do_cpu_preunmap(va_block_context->policy, uvm_va_block_get_va_space(va_block)));

    region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));

    uvm_mutex_lock(&va_block->lock);

    mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
    if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
        const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id);
        uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;

        // TODO: Bug 1877578
        //
        // We assume that if pages are mapped on the CPU and not resident on
        // the destination, the pages will change residency so the CPU must be
        // unmapped. If we implement automatic read-duplication heuristics in
        // the future, we'll also need to check if the pages are being
        // read-duplicated.
        uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest);

        num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region);
    }

    *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages;

    uvm_mutex_unlock(&va_block->lock);

    return num_cpu_unchanged_pages == 0;
}

static void preunmap_multi_block(uvm_va_range_t *va_range,
                                 uvm_va_block_context_t *va_block_context,
                                 NvU64 start,
                                 NvU64 end,
                                 uvm_processor_id_t dest_id)
{
    size_t i;
    const size_t first_block_index = uvm_va_range_block_index(va_range, start);
    const size_t last_block_index = uvm_va_range_block_index(va_range, end);
    NvU32 num_unmap_pages = 0;

    UVM_ASSERT(start >= va_range->node.start);
    UVM_ASSERT(end <= va_range->node.end);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));

    for (i = first_block_index; i <= last_block_index; i++) {
        NvU32 num_block_unmap_pages;

        if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
                                             va_block_context,
                                             start,
                                             end,
                                             dest_id,
                                             &num_block_unmap_pages)) {
            return;
        }

        num_unmap_pages += num_block_unmap_pages;
    }

    if (num_unmap_pages > 0)
        unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
}

static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
                                                  uvm_va_block_context_t *va_block_context,
                                                  NvU64 start,
                                                  NvU64 end,
                                                  uvm_processor_id_t dest_id,
                                                  uvm_migrate_mode_t mode,
                                                  uvm_tracker_t *out_tracker)
{
    size_t i;
    const size_t first_block_index = uvm_va_range_block_index(va_range, start);
    const size_t last_block_index = uvm_va_range_block_index(va_range, end);

    UVM_ASSERT(start >= va_range->node.start);
    UVM_ASSERT(end <= va_range->node.end);
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));

    // Iterate over blocks, populating them if necessary
    for (i = first_block_index; i <= last_block_index; i++) {
        uvm_va_block_retry_t va_block_retry;
        uvm_va_block_region_t region;
        uvm_va_block_t *va_block;
        NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);

        if (status != NV_OK)
            return status;

        region = uvm_va_block_region_from_start_end(va_block,
                                                    max(start, va_block->start),
                                                    min(end, va_block->end));

        status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
                                         uvm_va_block_migrate_locked(va_block,
                                                                     &va_block_retry,
                                                                     va_block_context,
                                                                     region,
                                                                     dest_id,
                                                                     mode,
                                                                     out_tracker));
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
                                      uvm_va_block_context_t *va_block_context,
                                      NvU64 start,
                                      NvU64 end,
                                      uvm_processor_id_t dest_id,
                                      uvm_migrate_mode_t mode,
                                      bool should_do_cpu_preunmap,
                                      uvm_tracker_t *out_tracker)
{
    NvU64 preunmap_range_start = start;

    UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_range));

    should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(va_block_context->policy,
                                                                                       va_range->va_space);

    // Divide migrations into groups of contiguous VA blocks. This is to trigger
    // CPU unmaps for that region before the migration starts.
    while (preunmap_range_start < end) {
        NV_STATUS status;
        NvU64 preunmap_range_end;

        if (should_do_cpu_preunmap) {
            preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
            preunmap_range_end = min(preunmap_range_end - 1, end);

            preunmap_multi_block(va_range,
                                 va_block_context,
                                 preunmap_range_start,
                                 preunmap_range_end,
                                 dest_id);
        }
        else {
            preunmap_range_end = end;
        }

        status = uvm_va_range_migrate_multi_block(va_range,
                                                  va_block_context,
                                                  preunmap_range_start,
                                                  preunmap_range_end,
                                                  dest_id,
                                                  mode,
                                                  out_tracker);
        if (status != NV_OK)
            return status;

        preunmap_range_start = preunmap_range_end + 1;
    }

    return NV_OK;
}

static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
                                    uvm_va_block_context_t *va_block_context,
                                    uvm_va_range_t *first_va_range,
                                    NvU64 base,
                                    NvU64 length,
                                    uvm_processor_id_t dest_id,
                                    uvm_migrate_mode_t mode,
                                    bool should_do_cpu_preunmap,
                                    uvm_tracker_t *out_tracker)
{
    uvm_va_range_t *va_range, *va_range_last;
    NvU64 end = base + length - 1;
    NV_STATUS status = NV_OK;
    bool skipped_migrate = false;

    if (!first_va_range) {
        // For HMM, we iterate over va_blocks since there is no va_range.
        return uvm_hmm_migrate_ranges(va_space,
                                      va_block_context,
                                      base,
                                      length,
                                      dest_id,
                                      mode,
                                      out_tracker);
    }

    UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));

    va_range_last = NULL;
    uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
        uvm_range_group_range_iter_t iter;
        va_range_last = va_range;

        // Only managed ranges can be migrated
        if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
            status = NV_ERR_INVALID_ADDRESS;
            break;
        }

        va_block_context->policy = uvm_va_range_get_policy(va_range);

        // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
        // into many range groups. For this reason, we iterate over each
        // va_range first and then over the range groups within it.
        uvm_range_group_for_each_migratability_in(&iter,
                                                  va_space,
                                                  max(base, va_range->node.start),
                                                  min(end, va_range->node.end)) {
            // Skip non-migratable VA ranges
            if (!iter.migratable) {
                // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages
                // aren't already resident at dest_id.
                if (!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, dest_id))
                    skipped_migrate = true;
            }
            else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
                     !uvm_id_equal(dest_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
                // Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
                // unless it's the preferred location
                status = NV_ERR_INVALID_DEVICE;
                break;
            }
            else {
                status = uvm_va_range_migrate(va_range,
                                              va_block_context,
                                              iter.start,
                                              iter.end,
                                              dest_id,
                                              mode,
                                              should_do_cpu_preunmap,
                                              out_tracker);
                if (status != NV_OK)
                    break;
            }
        }
    }

    if (status != NV_OK)
        return status;

    // Check that we were able to iterate over the entire range without any gaps
    if (!va_range_last || va_range_last->node.end < end)
        return NV_ERR_INVALID_ADDRESS;

    if (skipped_migrate)
        return NV_WARN_MORE_PROCESSING_REQUIRED;

    return NV_OK;
}

static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
                             struct mm_struct *mm,
                             NvU64 base,
                             NvU64 length,
                             uvm_processor_id_t dest_id,
                             NvU32 migrate_flags,
                             uvm_va_range_t *first_va_range,
                             uvm_tracker_t *out_tracker)
{
    NV_STATUS status = NV_OK;
    uvm_va_block_context_t *va_block_context;
    bool do_mappings;
    bool do_two_passes;
    bool is_single_block;
    bool should_do_cpu_preunmap;

    uvm_assert_rwsem_locked(&va_space->lock);

    // If the GPU has its memory disabled, just skip the migration and let
    // faults take care of things.
    if (!uvm_va_space_processor_has_memory(va_space, dest_id))
        return NV_OK;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);

    va_block_context = uvm_va_block_context_alloc(mm);
    if (!va_block_context)
        return NV_ERR_NO_MEMORY;

    // We perform two passes (unless the migration only covers a single VA
    // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the
    // following scenarios:
    //
    // - Migrations that add CPU mappings, since they are synchronous operations
    //   that delay the migration of the next VA blocks.
    // - Concurrent migrations. This is due to our current channel selection
    //   logic that doesn't prevent false dependencies between independent
    //   operations.
    //   For example, removal of mappings for outgoing transfers is delayed by
    //   the mappings added by incoming transfers.
    //   TODO: Bug 1764953: Re-evaluate the two-pass logic when channel
    //   selection is overhauled.
    //
    // The two passes are as follows:
    //
    // 1- Transfer all VA blocks (do not add mappings)
    // 2- Go block by block re-executing the transfer (in case someone moved it
    //    since the first pass), and adding the mappings.
    //
    // For HMM (!first_va_range), we always do a single pass.
    is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
    do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
    do_two_passes = do_mappings && !is_single_block;

    if (do_two_passes) {
        should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block);

        status = uvm_migrate_ranges(va_space,
                                    va_block_context,
                                    first_va_range,
                                    base,
                                    length,
                                    dest_id,
                                    UVM_MIGRATE_MODE_MAKE_RESIDENT,
                                    should_do_cpu_preunmap,
                                    out_tracker);
    }

    if (status == NV_OK) {
        uvm_migrate_mode_t mode = do_mappings ? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP :
                                                UVM_MIGRATE_MODE_MAKE_RESIDENT;
        uvm_migrate_pass_t pass = do_two_passes ? UVM_MIGRATE_PASS_SECOND :
                                                  UVM_MIGRATE_PASS_FIRST;
        should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block);

        status = uvm_migrate_ranges(va_space,
                                    va_block_context,
                                    first_va_range,
                                    base,
                                    length,
                                    dest_id,
                                    mode,
                                    should_do_cpu_preunmap,
                                    out_tracker);
    }

    uvm_va_block_context_free(va_block_context);

    return status;
}

static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu,
                                            uvm_va_range_semaphore_pool_t *semaphore_va_range,
                                            NvU64 semaphore_user_addr,
                                            NvU32 semaphore_payload,
                                            uvm_tracker_t *release_after_tracker)
{
    NV_STATUS status;
    uvm_push_t push;
    uvm_channel_type_t channel_type;
    NvU64 semaphore_gpu_va;
    NvU64 semaphore_offset;

    UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu));

    semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr;
    semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset;

    // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a
    // performance standpoint because if the migration is targeting a GPU, it is
    // likely that the channel used for the GPU page table update (pushed to
    // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. Using a
    // single channel avoids an inter-channel dependency that could add a
    // significant overhead to the enclosing migration.
    //
    // In SR-IOV heavy, the user semaphore release is functionally forbidden
    // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is not
    // a page tree operation.
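    // Fall back to the GPU-internal channel type in that case.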
    if (uvm_gpu_is_virt_mode_sriov_heavy(gpu))
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    else
        channel_type = UVM_CHANNEL_TYPE_MEMOPS;

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    channel_type,
                                    release_after_tracker,
                                    &push,
                                    "Pushing semaphore release (*0x%llx = %u)",
                                    semaphore_user_addr,
                                    semaphore_payload);
    if (status != NV_OK)
        return status;

    gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload);
    uvm_push_end(&push);

    uvm_mutex_lock(&semaphore_va_range->tracker_lock);
    status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push);
    uvm_tracker_remove_completed(&semaphore_va_range->tracker);
    uvm_mutex_unlock(&semaphore_va_range->tracker_lock);

    return status;
}

static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload)
{
    char *semaphore_cpu_va;
    NvU64 semaphore_offset;

    UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem));

    semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr;

    // Prevent processor speculation prior to accessing user-mapped memory to
    // avoid leaking information from side-channel attacks. Under speculation, a
    // valid VA range which does not contain this semaphore could be used by the
    // caller. It's unclear but likely that the user might be able to control
    // the data at that address. Auditing all potential ways that could happen
    // is difficult and error-prone, so to be on the safe side we'll just always
    // block speculation.
    nv_speculation_barrier();

    semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;

    UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
}

static NV_STATUS semaphore_release(NvU64 semaphore_address,
                                   NvU32 semaphore_payload,
                                   uvm_va_range_semaphore_pool_t *semaphore_pool,
                                   uvm_gpu_t *dest_gpu,
                                   uvm_tracker_t *tracker_ptr)
{
    uvm_gpu_t *gpu;
    uvm_gpu_t *gpu_owner = semaphore_pool->owner;

    // If there is a GPU owner, release the semaphore from it.
    if (gpu_owner != NULL)
        return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);

    // Attempt eager release from CPU if the tracker is already completed.
    if (uvm_tracker_is_completed(tracker_ptr)) {
        semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload);
        return NV_OK;
    }

    if (dest_gpu == NULL) {
        // The destination is the CPU, but we didn't do a CPU release above
        // because the previous work is not complete. This situation arises when
        // accessed_by mappings are being set up asynchronously, or the
        // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should
        // be a registered GPU, since all CPU work is synchronous, and the
        // semaphore must be mapped on that GPU.
        //
        // Note that the GPU selected for the release may not be the same device
        // that prevented the tracker from being complete.
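        //
        // Pick the first GPU that has the semaphore memory mapped in its
        // kernel address space and release from there.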
        gpu = uvm_global_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on);

        UVM_ASSERT(gpu != NULL);
    }
    else {
        gpu = dest_gpu;
    }

    return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
}

NV_STATUS uvm_migrate_init(void)
{
    NV_STATUS status = uvm_migrate_pageable_init();
    if (status != NV_OK)
        return status;

    g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0;

    BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1));

    if (g_uvm_perf_migrate_cpu_preunmap_enable) {
        if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) {
            g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order;
        }
        else {
            g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;

            pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n",
                    uvm_perf_migrate_cpu_preunmap_block_order,
                    UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT);
        }
    }

    return NV_OK;
}

void uvm_migrate_exit(void)
{
    uvm_migrate_pageable_exit();
}

NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_tracker_t tracker = UVM_TRACKER_INIT();
    uvm_tracker_t *tracker_ptr = NULL;
    uvm_gpu_t *dest_gpu = NULL;
    uvm_va_range_t *sema_va_range = NULL;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;
    bool flush_events = false;
    const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC);

    // We temporarily allow 0 length in the IOCTL parameters as a signal to
    // only release the semaphore. This is because user-space is in charge of
    // migrating pageable memory in some cases.
    //
    // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch
    // to migrate_vma for all types of vmas.
    if (params->length > 0 || synchronous || params->semaphoreAddress == 0) {
        if (uvm_api_range_invalid(params->base, params->length))
            return NV_ERR_INVALID_ADDRESS;
    }

    if (params->flags & ~UVM_MIGRATE_FLAGS_ALL)
        return NV_ERR_INVALID_ARGUMENT;

    if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
        UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
        return NV_ERR_INVALID_ARGUMENT;
    }

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (synchronous) {
        if (params->semaphoreAddress != 0) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto done;
        }
    }
    else {
        if (params->semaphoreAddress == 0) {
            if (params->semaphorePayload != 0) {
                status = NV_ERR_INVALID_ARGUMENT;
                goto done;
            }
        }
        else {
            sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
            if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
                !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
                status = NV_ERR_INVALID_ADDRESS;
                goto done;
            }
        }
    }

    if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
        if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
            dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
        else
            dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);

        if (!dest_gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
            status = NV_ERR_OUT_OF_RANGE;
            goto done;
        }
    }

    UVM_ASSERT(status == NV_OK);

    // If we're synchronous or if we need to release a semaphore, use a tracker.
    if (synchronous || params->semaphoreAddress)
        tracker_ptr = &tracker;

    if (params->length > 0) {
        uvm_api_range_type_t type;

        type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
        if (type == UVM_API_RANGE_TYPE_INVALID) {
            status = NV_ERR_INVALID_ADDRESS;
            goto done;
        }

        if (type == UVM_API_RANGE_TYPE_ATS) {
            uvm_migrate_args_t uvm_migrate_args =
            {
                .va_space = va_space,
                .mm = mm,
                .start = params->base,
                .length = params->length,
                .dst_id = (dest_gpu ? dest_gpu->id : UVM_ID_CPU),
                .dst_node_id = (int)params->cpuNumaNode,
                .populate_permissions = UVM_POPULATE_PERMISSIONS_INHERIT,
                .touch = false,
                .skip_mapped = false,
                .user_space_start = &params->userSpaceStart,
                .user_space_length = &params->userSpaceLength,
            };

            status = uvm_migrate_pageable(&uvm_migrate_args);
        }
        else {
            status = uvm_migrate(va_space,
                                 mm,
                                 params->base,
                                 params->length,
                                 (dest_gpu ? dest_gpu->id : UVM_ID_CPU),
                                 params->flags,
                                 uvm_va_space_iter_first(va_space,
                                                         params->base,
                                                         params->base),
                                 tracker_ptr);
        }
    }

done:
    // We only need to hold mmap_lock to create new CPU mappings, so drop it if
    // we need to wait for the tracker to finish.
    //
    // TODO: Bug 1766650: For large migrations with destination CPU, try
    //       benchmarks to see if a two-pass approach would be faster (first
    //       pass pushes all GPU work asynchronously, second pass updates CPU
    //       mappings synchronously).
    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    if (tracker_ptr) {
        // If requested, release semaphore
        if (params->semaphoreAddress && (status == NV_OK)) {
            status = semaphore_release(params->semaphoreAddress,
                                       params->semaphorePayload,
                                       &sema_va_range->semaphore_pool,
                                       dest_gpu,
                                       tracker_ptr);
        }

        // Wait on the tracker if we are synchronous or there was an error. The
        // VA space lock must be held to prevent GPUs from being unregistered.
        if (synchronous || (status != NV_OK)) {
            NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);

            // Only clobber status if we didn't hit an earlier error
            if (status == NV_OK)
                status = tracker_status;

            flush_events = true;
        }

        uvm_tracker_deinit(tracker_ptr);
    }

    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);

    // If the migration is known to be complete, eagerly dispatch the migration
    // events, instead of processing them on a later event flush. Note that an
    // asynchronous migration could be complete by now, but the flush would not
    // be triggered.
    if (flush_events)
        uvm_tools_flush_events();

    return status;
}

NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    NV_STATUS tracker_status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    uvm_range_group_t *range_group;
    uvm_range_group_range_t *rgr;
    uvm_processor_id_t dest_id;
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NvU32 migrate_flags = 0;
    uvm_gpu_t *gpu = NULL;

    // mmap_lock will be needed if we have to create CPU mappings
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (uvm_uuid_is_cpu(&params->destinationUuid)) {
        dest_id = UVM_ID_CPU;
    }
    else {
        gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
        if (!gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto done;
        }

        dest_id = gpu->id;
    }

    range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
    if (!range_group) {
        status = NV_ERR_OBJECT_NOT_FOUND;
        goto done;
    }

    // Migrate all VA ranges in the range group. uvm_migrate is used because it
    // performs all VA range validity checks.
    list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) {
        NvU64 start = rgr->node.start;
        NvU64 length = rgr->node.end - rgr->node.start + 1;

        if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
            status = NV_ERR_OUT_OF_RANGE;
        }
        else {
            uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);

            if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
                status = NV_ERR_INVALID_ADDRESS;
                goto done;
            }

            status = uvm_migrate(va_space,
                                 mm,
                                 start,
                                 length,
                                 dest_id,
                                 migrate_flags,
                                 first_va_range,
                                 &local_tracker);
        }

        if (status != NV_OK)
            goto done;
    }

done:
    // We only need to hold mmap_lock to create new CPU mappings, so drop it if
    // we need to wait for the tracker to finish.
    //
    // TODO: Bug 1766650: For large migrations with destination CPU, try
    //       benchmarks to see if a two-pass approach would be faster (first
    //       pass pushes all GPU work asynchronously, second pass updates CPU
    //       mappings synchronously).
    if (mm)
        uvm_up_read_mmap_lock_out_of_order(mm);

    tracker_status = uvm_tracker_wait_deinit(&local_tracker);
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release(va_space, mm);

    // This API is synchronous, so wait for migrations to finish
    uvm_tools_flush_events();

    return status == NV_OK ? tracker_status : status;
}