/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hmm.h"

// Support for HMM ( https://docs.kernel.org/mm/hmm.html ):

#ifdef NVCPU_X86_64
static bool uvm_disable_hmm = false;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: false (HMM is enabled if possible). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
#else
// So far, we've only tested HMM on x86_64, so disable it by default everywhere
// else.
static bool uvm_disable_hmm = true;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: true (HMM is not enabled on this CPU architecture). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
#endif

module_param(uvm_disable_hmm, bool, 0444);

#if UVM_IS_CONFIG_HMM()

#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/migrate.h>
#include <linux/userfaultfd_k.h>
#include <linux/memremap.h>
#include <linux/wait.h>

#include "uvm_common.h"
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_range_tree.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_lock.h"
#include "uvm_api.h"
#include "uvm_va_policy.h"
#include "uvm_tools.h"

static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
                               uvm_page_index_t page_index,
                               struct page *page);

typedef struct
{
    uvm_processor_id_t processor_id;
    uvm_processor_id_t new_residency;
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_service_block_context_t *service_context;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_gpu_fault_event_t;

typedef struct
{
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_va_block_context_t *va_block_context;
    uvm_va_block_region_t region;
    uvm_processor_id_t dest_id;
    uvm_make_resident_cause_t cause;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_migrate_event_t;

typedef struct
{
    uvm_processor_id_t processor_id;
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_service_block_context_t *service_context;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_devmem_fault_context_t;

bool uvm_hmm_is_enabled_system_wide(void)
{
    if (uvm_disable_hmm)
        return false;

    if (g_uvm_global.ats.enabled)
        return false;

    // Confidential Computing and HMM impose mutually exclusive constraints. In
    // Confidential Computing the GPU can only access pages resident in vidmem,
    // but in HMM pages may be required to be resident in sysmem: file-backed
    // VMAs, huge pages, etc.
123 if (g_uvm_global.conf_computing_enabled) 124 return false; 125 126 return uvm_va_space_mm_enabled_system(); 127 } 128 129 bool uvm_hmm_is_enabled(uvm_va_space_t *va_space) 130 { 131 return uvm_hmm_is_enabled_system_wide() && 132 uvm_va_space_mm_enabled(va_space) && 133 !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM); 134 } 135 136 static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node) 137 { 138 if (!node) 139 return NULL; 140 return container_of(node, uvm_va_block_t, hmm.node); 141 } 142 143 void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space) 144 { 145 uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm; 146 147 if (!uvm_hmm_is_enabled(va_space)) 148 return; 149 150 uvm_range_tree_init(&hmm_va_space->blocks); 151 uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF); 152 153 return; 154 } 155 156 void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space) 157 { 158 uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm; 159 uvm_range_tree_node_t *node, *next; 160 uvm_va_block_t *va_block; 161 162 if (!uvm_hmm_is_enabled(va_space)) 163 return; 164 165 uvm_assert_rwsem_locked_write(&va_space->lock); 166 167 // The blocks_lock is not needed when the va_space lock is held for write. 168 uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) { 169 va_block = hmm_va_block_from_node(node); 170 uvm_range_tree_remove(&hmm_va_space->blocks, node); 171 mmu_interval_notifier_remove(&va_block->hmm.notifier); 172 uvm_va_block_kill(va_block); 173 } 174 } 175 176 static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block, 177 uvm_gpu_t *gpu, 178 struct mm_struct *mm) 179 { 180 uvm_va_policy_node_t *node; 181 182 uvm_mutex_lock(&va_block->lock); 183 184 // Reset preferred location and accessed-by of policy nodes if needed. 185 uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) { 186 if (uvm_id_equal(node->policy.preferred_location, gpu->id)) 187 node->policy.preferred_location = UVM_ID_INVALID; 188 189 uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id); 190 } 191 192 // Migrate and free any remaining resident allocations on this GPU. 193 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm); 194 195 uvm_mutex_unlock(&va_block->lock); 196 } 197 198 void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm) 199 { 200 uvm_range_tree_node_t *node; 201 uvm_va_block_t *va_block; 202 203 if (!uvm_hmm_is_enabled(va_space)) 204 return; 205 206 if (mm) 207 uvm_assert_mmap_lock_locked(mm); 208 uvm_assert_rwsem_locked_write(&va_space->lock); 209 210 uvm_range_tree_for_each(node, &va_space->hmm.blocks) { 211 va_block = hmm_va_block_from_node(node); 212 213 hmm_va_block_unregister_gpu(va_block, gpu, mm); 214 } 215 } 216 217 static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, 218 uvm_gpu_va_space_t *gpu_va_space, 219 uvm_va_block_context_t *va_block_context) 220 { 221 uvm_mutex_lock(&va_block->lock); 222 223 uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context); 224 225 uvm_mutex_unlock(&va_block->lock); 226 227 // TODO: Bug 3660922: Need to handle read duplication at some point. 228 // See range_remove_gpu_va_space_managed(). 
229 } 230 231 void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space, 232 uvm_gpu_va_space_t *gpu_va_space, 233 struct mm_struct *mm) 234 { 235 uvm_va_block_context_t *va_block_context; 236 uvm_range_tree_node_t *node, *next; 237 uvm_va_block_t *va_block; 238 239 if (!uvm_hmm_is_enabled(va_space)) 240 return; 241 242 if (mm) 243 uvm_assert_mmap_lock_locked(mm); 244 uvm_assert_rwsem_locked_write(&va_space->lock); 245 246 va_block_context = uvm_va_space_block_context(va_space, mm); 247 248 uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) { 249 va_block = hmm_va_block_from_node(node); 250 251 hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context); 252 } 253 } 254 255 static bool hmm_invalidate(uvm_va_block_t *va_block, 256 const struct mmu_notifier_range *range, 257 unsigned long cur_seq) 258 { 259 uvm_thread_context_t *uvm_context = uvm_thread_context(); 260 struct mmu_interval_notifier *mni = &va_block->hmm.notifier; 261 struct mm_struct *mm = mni->mm; 262 uvm_va_block_context_t *va_block_context; 263 uvm_va_block_region_t region; 264 NvU64 start, end; 265 uvm_processor_id_t id; 266 NV_STATUS status = NV_OK; 267 268 // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release() 269 // doesn't remove the interval notifiers from the struct_mm so there will 270 // be a full range MMU_NOTIFY_UNMAP event after the release from 271 // unmap_vmas() during exit_mmap(). 272 if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE) 273 return true; 274 275 // Blockable is only set false by 276 // mmu_notifier_invalidate_range_start_nonblock() which is only called in 277 // __oom_reap_task_mm(). 278 if (!mmu_notifier_range_blockable(range)) 279 return false; 280 281 // We only ignore invalidations in this context whilst holding the 282 // va_block lock. This prevents deadlock when try_to_migrate() 283 // calls the notifier, but holding the lock prevents other threads 284 // invalidating PTEs so we can safely assume the results of 285 // migrate_vma_setup() are correct. 286 if (uvm_context->ignore_hmm_invalidate_va_block == va_block || 287 ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) && 288 range->owner == &g_uvm_global)) 289 return true; 290 291 va_block_context = uvm_va_block_context_alloc(mm); 292 if (!va_block_context) 293 return true; 294 295 uvm_mutex_lock(&va_block->lock); 296 297 // mmu_interval_notifier_remove() is always called before marking a 298 // va_block as dead so this va_block has to be alive. 299 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 300 301 // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff] 302 // Also note that hmm_invalidate() can be called when a new va_block is not 303 // yet inserted into the va_space->hmm.blocks table while the original 304 // va_block is being split. The original va_block may have its end address 305 // updated before the mmu interval notifier is updated so this invalidate 306 // may be for a range past the va_block end address. 307 start = range->start; 308 end = (range->end == ULONG_MAX) ? range->end : range->end - 1; 309 if (start < va_block->start) 310 start = va_block->start; 311 if (end > va_block->end) 312 end = va_block->end; 313 if (start > end) 314 goto unlock; 315 316 // These will be equal if no other thread causes an invalidation 317 // whilst the va_block lock was dropped. 
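    // (Informal reading, not a guarantee: both counters below are bumped
    // together while the va_block lock is held. The expected usage, per the
    // comment above, is that a caller snapshots them before dropping the lock
    // and compares the snapshots afterwards; a mismatch means an invalidation
    // raced with the caller and the result must be revalidated.)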
318 uvm_context->hmm_invalidate_seqnum++; 319 va_block->hmm.changed++; 320 321 mmu_interval_set_seq(mni, cur_seq); 322 323 region = uvm_va_block_region_from_start_end(va_block, start, end); 324 325 va_block_context->hmm.vma = NULL; 326 327 // We only need to unmap GPUs since Linux handles the CPUs. 328 for_each_gpu_id_in_mask(id, &va_block->mapped) { 329 status = uvm_va_block_unmap(va_block, 330 va_block_context, 331 id, 332 region, 333 uvm_va_block_map_mask_get(va_block, id), 334 &va_block->tracker); 335 // Note that the va_block lock can be dropped, relocked, and 336 // NV_ERR_MORE_PROCESSING_REQUIRED returned. 337 if (status != NV_OK) 338 break; 339 } 340 341 if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR) 342 uvm_va_block_munmap_region(va_block, region); 343 344 if (status == NV_OK) 345 status = uvm_tracker_wait(&va_block->tracker); 346 347 // Remove stale HMM struct page pointers to system memory. 348 uvm_va_block_remove_cpu_chunks(va_block, region); 349 350 unlock: 351 uvm_mutex_unlock(&va_block->lock); 352 353 uvm_va_block_context_free(va_block_context); 354 355 UVM_ASSERT(status == NV_OK); 356 return true; 357 } 358 359 static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni, 360 const struct mmu_notifier_range *range, 361 unsigned long cur_seq) 362 { 363 uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier); 364 365 UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq)); 366 } 367 368 static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops = 369 { 370 .invalidate = uvm_hmm_invalidate_entry, 371 }; 372 373 NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space, 374 NvU64 addr, 375 uvm_va_block_t **va_block_ptr) 376 { 377 uvm_range_tree_node_t *node; 378 379 if (!uvm_hmm_is_enabled(va_space)) 380 return NV_ERR_INVALID_ADDRESS; 381 382 uvm_assert_rwsem_locked(&va_space->lock); 383 384 uvm_mutex_lock(&va_space->hmm.blocks_lock); 385 node = uvm_range_tree_find(&va_space->hmm.blocks, addr); 386 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 387 388 if (!node) 389 return NV_ERR_OBJECT_NOT_FOUND; 390 391 *va_block_ptr = hmm_va_block_from_node(node); 392 393 return NV_OK; 394 } 395 396 static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block) 397 { 398 uvm_thread_context_t *uvm_context = uvm_thread_context(); 399 int ret; 400 401 // It's only safe to ignore invalidations whilst doing a migration 402 // and holding the va_block lock. 403 uvm_assert_mutex_locked(&va_block->lock); 404 uvm_context->ignore_hmm_invalidate_va_block = va_block; 405 ret = migrate_vma_setup(args); 406 407 // We shouldn't be generating any more invalidations now. 408 uvm_context->ignore_hmm_invalidate_va_block = NULL; 409 return ret; 410 } 411 412 static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma, 413 unsigned long addr, 414 bool allow_unreadable_vma) 415 { 416 // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support 417 // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ 418 // but we allow those VMAs to have policy set on them. 419 // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled 420 // by uvm_hmm_must_use_sysmem() forcing residency to the CPU. 
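    // allow_unreadable_vma is passed as true by the policy-setting paths (see
    // the hmm_va_block_find_create(..., true, ...) callers below) so that
    // PROT_NONE VMAs can still carry UVM policy; the fault and migration paths
    // pass false.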
421 return vma && 422 addr >= vma->vm_start && 423 !userfaultfd_armed(vma) && 424 !(vma->vm_flags & (VM_IO | VM_PFNMAP)) && 425 !uvm_vma_is_managed(vma) && 426 (allow_unreadable_vma || (vma->vm_flags & VM_READ)); 427 } 428 429 static void hmm_va_block_init(uvm_va_block_t *va_block, 430 uvm_va_space_t *va_space, 431 NvU64 start, 432 NvU64 end) 433 { 434 va_block->hmm.va_space = va_space; 435 va_block->hmm.node.start = start; 436 va_block->hmm.node.end = end; 437 uvm_range_tree_init(&va_block->hmm.va_policy_tree); 438 uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE); 439 } 440 441 static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space, 442 NvU64 addr, 443 bool allow_unreadable_vma, 444 struct vm_area_struct **vma_out, 445 uvm_va_block_t **va_block_ptr) 446 { 447 struct mm_struct *mm; 448 struct vm_area_struct *va_block_vma; 449 uvm_va_block_t *va_block; 450 NvU64 start, end; 451 NV_STATUS status; 452 int ret; 453 454 if (!uvm_hmm_is_enabled(va_space)) 455 return NV_ERR_INVALID_ADDRESS; 456 457 mm = va_space->va_space_mm.mm; 458 uvm_assert_mmap_lock_locked(mm); 459 uvm_assert_rwsem_locked(&va_space->lock); 460 UVM_ASSERT(PAGE_ALIGNED(addr)); 461 462 // Note that we have to allow PROT_NONE VMAs so that policies can be set. 463 va_block_vma = find_vma(mm, addr); 464 if (!uvm_hmm_vma_is_valid(va_block_vma, addr, allow_unreadable_vma)) 465 return NV_ERR_INVALID_ADDRESS; 466 467 // Since we only hold the va_space read lock, there can be multiple 468 // parallel va_block insertions. 469 uvm_mutex_lock(&va_space->hmm.blocks_lock); 470 471 va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr)); 472 if (va_block) 473 goto done; 474 475 // The va_block is always created to cover the whole aligned 476 // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or 477 // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it 478 // doesn't overlap. 479 start = UVM_VA_BLOCK_ALIGN_DOWN(addr); 480 end = start + UVM_VA_BLOCK_SIZE - 1; 481 482 // Search for existing UVM va_ranges in the start/end interval and create 483 // a maximum interval that doesn't overlap any existing UVM va_ranges. 484 // We know that 'addr' is not within a va_range or 485 // hmm_va_block_find_create() wouldn't be called. 486 status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end); 487 UVM_ASSERT(status == NV_OK); 488 489 // Search for existing HMM va_blocks in the start/end interval and create 490 // a maximum interval that doesn't overlap any existing HMM va_blocks. 491 status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end); 492 UVM_ASSERT(status == NV_OK); 493 494 // Create a HMM va_block with a NULL va_range pointer. 
495 status = uvm_va_block_create(NULL, start, end, &va_block); 496 if (status != NV_OK) 497 goto err_unlock; 498 499 hmm_va_block_init(va_block, va_space, start, end); 500 501 ret = mmu_interval_notifier_insert(&va_block->hmm.notifier, 502 mm, 503 start, 504 end - start + 1, 505 &uvm_hmm_notifier_ops); 506 if (ret) { 507 status = errno_to_nv_status(ret); 508 goto err_release; 509 } 510 511 status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node); 512 UVM_ASSERT(status == NV_OK); 513 514 done: 515 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 516 if (vma_out) 517 *vma_out = va_block_vma; 518 *va_block_ptr = va_block; 519 return NV_OK; 520 521 err_release: 522 uvm_va_block_release(va_block); 523 524 err_unlock: 525 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 526 return status; 527 } 528 529 NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space, 530 NvU64 addr, 531 struct vm_area_struct **vma, 532 uvm_va_block_t **va_block_ptr) 533 { 534 return hmm_va_block_find_create(va_space, addr, false, vma, va_block_ptr); 535 } 536 537 NV_STATUS uvm_hmm_find_vma(struct mm_struct *mm, struct vm_area_struct **vma_out, NvU64 addr) 538 { 539 if (!mm) 540 return NV_ERR_INVALID_ADDRESS; 541 542 uvm_assert_mmap_lock_locked(mm); 543 544 *vma_out = find_vma(mm, addr); 545 if (!uvm_hmm_vma_is_valid(*vma_out, addr, false)) 546 return NV_ERR_INVALID_ADDRESS; 547 548 return NV_OK; 549 } 550 551 bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block, 552 struct vm_area_struct *vma, 553 uvm_va_block_region_t region) 554 { 555 uvm_assert_mutex_locked(&va_block->lock); 556 557 if (uvm_va_block_is_hmm(va_block)) { 558 UVM_ASSERT(vma); 559 UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm); 560 uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm); 561 UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region)); 562 UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region)); 563 } 564 565 return true; 566 } 567 568 void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context) 569 { 570 // TODO: Bug 4050579: Remove this when swap cached pages can be migrated. 571 service_context->block_context.hmm.swap_cached = false; 572 } 573 574 NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block) 575 { 576 if (uvm_mutex_trylock(&va_block->hmm.migrate_lock)) 577 return NV_OK; 578 579 return NV_ERR_BUSY_RETRY; 580 } 581 582 void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block) 583 { 584 uvm_mutex_lock(&va_block->hmm.migrate_lock); 585 } 586 587 void uvm_hmm_migrate_finish(uvm_va_block_t *va_block) 588 { 589 uvm_mutex_unlock(&va_block->hmm.migrate_lock); 590 } 591 592 // Migrate the given range [start end] within a va_block to dest_id. 
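// Locking (as suggested by the comment inside the policy loop below): the
// caller holds the mmap lock and the va_space lock; the va_block migrate lock
// and the va_block lock are taken and released internally.
//
// Illustrative call sketch, mirroring the caller in split_block_if_needed()
// below (not additional driver code; 'from' and 'to' stand for whatever
// sub-range the caller computed):
//
//     status = hmm_migrate_range(va_block,
//                                NULL,
//                                va_block_context,
//                                UVM_ID_CPU,
//                                from,
//                                to,
//                                UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
//                                NULL);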
static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
                                   uvm_va_block_retry_t *va_block_retry,
                                   uvm_va_block_context_t *va_block_context,
                                   uvm_processor_id_t dest_id,
                                   NvU64 start,
                                   NvU64 end,
                                   uvm_migrate_mode_t mode,
                                   uvm_tracker_t *out_tracker)
{
    uvm_va_block_region_t region;
    uvm_va_policy_node_t *node;
    const uvm_va_policy_t *policy;
    NV_STATUS status = NV_OK;

    uvm_hmm_migrate_begin_wait(va_block);
    uvm_mutex_lock(&va_block->lock);

    uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
        // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
        // va_block lock, the policy remains valid because we hold the mmap
        // lock so munmap can't remove the policy, and the va_space lock so the
        // policy APIs can't change the policy.
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
                                           va_block_retry,
                                           uvm_va_block_migrate_locked(va_block,
                                                                       va_block_retry,
                                                                       va_block_context,
                                                                       region,
                                                                       dest_id,
                                                                       mode,
                                                                       out_tracker));
        if (status != NV_OK)
            break;
    }

    uvm_mutex_unlock(&va_block->lock);
    uvm_hmm_migrate_finish(va_block);

    return status;
}

void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
{
    // We can't use uvm_va_space_mm_retain(), because the va_space_mm
    // should already be dead by now.
    struct mm_struct *mm = va_space->va_space_mm.mm;
    uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *block_context;

    uvm_down_read_mmap_lock(mm);
    uvm_va_space_down_write(va_space);

    uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
        uvm_va_block_region_t region;
        struct vm_area_struct *vma;

        va_block = hmm_va_block_from_node(node);
        block_context = uvm_va_space_block_context(va_space, mm);
        uvm_hmm_migrate_begin_wait(va_block);
        uvm_mutex_lock(&va_block->lock);
        for_each_va_block_vma_region(va_block, mm, vma, &region) {
            if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
                continue;

            block_context->hmm.vma = vma;
            uvm_hmm_va_block_migrate_locked(va_block,
                                            NULL,
                                            block_context,
                                            UVM_ID_CPU,
                                            region,
                                            UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
        uvm_mutex_unlock(&va_block->lock);
        uvm_hmm_migrate_finish(va_block);
    }

    uvm_va_space_up_write(va_space);
    uvm_up_read_mmap_lock(mm);
}

NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
    uvm_va_block_test_t *block_test;
    uvm_va_block_t *va_block;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
    if (status != NV_OK)
        return status;

    block_test = uvm_va_block_get_test(va_block);
    if (block_test)
        block_test->inject_split_error = true;

    return NV_OK;
}

typedef struct {
    struct mmu_interval_notifier notifier;
    uvm_va_block_t *existing_block;
} hmm_split_invalidate_data_t;

static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
                                 const struct mmu_notifier_range *range,
                                 unsigned long cur_seq)
{
    hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);

    uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
hmm_invalidate(split_data->existing_block, range, cur_seq); 708 709 return true; 710 } 711 712 static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni, 713 const struct mmu_notifier_range *range, 714 unsigned long cur_seq) 715 { 716 UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq)); 717 } 718 719 static const struct mmu_interval_notifier_ops hmm_notifier_split_ops = 720 { 721 .invalidate = hmm_split_invalidate_entry, 722 }; 723 724 // Splits existing va_block into two pieces, with new_va_block always after 725 // va_block. va_block is updated to have new_end. new_end+1 must be page- 726 // aligned. 727 // 728 // Before: [----------- existing ------------] 729 // After: [---- existing ----][---- new ----] 730 // ^new_end 731 // 732 // On error, va_block is still accessible and is left in its original 733 // functional state. 734 static NV_STATUS hmm_split_block(uvm_va_block_t *va_block, 735 NvU64 new_end, 736 uvm_va_block_t **new_block_ptr) 737 { 738 uvm_va_space_t *va_space = va_block->hmm.va_space; 739 struct mm_struct *mm = va_space->va_space_mm.mm; 740 hmm_split_invalidate_data_t split_data; 741 NvU64 delay_us; 742 uvm_va_block_t *new_va_block; 743 NV_STATUS status; 744 int ret; 745 746 uvm_assert_rwsem_locked_write(&va_space->lock); 747 748 UVM_ASSERT(new_end > va_block->start); 749 UVM_ASSERT(new_end < va_block->end); 750 UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); 751 752 status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block); 753 if (status != NV_OK) 754 return status; 755 756 // Initialize the newly created HMM va_block. 757 hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end); 758 759 ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier, 760 mm, 761 new_va_block->start, 762 uvm_va_block_size(new_va_block), 763 &uvm_hmm_notifier_ops); 764 765 // Since __mmu_notifier_register() was called when the va_space was 766 // initially created, we know that mm->notifier_subscriptions is valid 767 // and mmu_interval_notifier_insert() can't return ENOMEM. 768 // The only error return is for start + length overflowing but we already 769 // registered the same address range before so there should be no error. 770 UVM_ASSERT(!ret); 771 772 uvm_mutex_lock(&va_block->lock); 773 774 status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL); 775 if (status != NV_OK) 776 goto err; 777 778 uvm_mutex_unlock(&va_block->lock); 779 780 // The MMU interval notifier has to be removed in order to resize it. 781 // That means there would be a window of time when invalidation callbacks 782 // could be missed. To handle this case, we register a temporary notifier 783 // to cover the address range while resizing the old notifier (it is 784 // OK to have multiple notifiers for the same range, we may simply try to 785 // invalidate twice). 786 split_data.existing_block = va_block; 787 ret = mmu_interval_notifier_insert(&split_data.notifier, 788 mm, 789 va_block->start, 790 new_end - va_block->start + 1, 791 &hmm_notifier_split_ops); 792 UVM_ASSERT(!ret); 793 794 // Delay to allow hmm_sanity test to trigger an mmu_notifier during the 795 // critical window where the split invalidate callback is active. 796 delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us); 797 if (delay_us) 798 udelay(delay_us); 799 800 mmu_interval_notifier_remove(&va_block->hmm.notifier); 801 802 // Enable notifications on the old block with the smaller size. 
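    // (Recap of the notifier juggling above: a temporary notifier covering the
    // old block's reduced range was inserted, the original full-size notifier
    // was removed, the notifier is re-inserted below with the smaller size,
    // and the temporary notifier is then removed. Invalidations arriving in
    // between are still delivered via the temporary notifier, at worst causing
    // a harmless double invalidation.)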
    ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
                                       mm,
                                       va_block->start,
                                       uvm_va_block_size(va_block),
                                       &uvm_hmm_notifier_ops);
    UVM_ASSERT(!ret);

    mmu_interval_notifier_remove(&split_data.notifier);

    if (new_block_ptr)
        *new_block_ptr = new_va_block;

    return status;

err:
    uvm_mutex_unlock(&va_block->lock);
    mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
    uvm_va_block_release(new_va_block);
    return status;
}

// Check to see if the HMM va_block would overlap the range start/end and
// split it so it can be removed. That breaks down to the following cases:
// start/end could cover all of the HMM va_block ->
//     remove the va_block
// start/end could cover the left part of the HMM va_block ->
//     remove the left part
// start/end could cover the right part of the HMM va_block ->
//     remove the right part
// or start/end could "punch a hole" in the middle and leave the ends intact.
// In each case, only one HMM va_block is removed so return it in out_va_block.
static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
                                       NvU64 start,
                                       NvU64 end,
                                       uvm_va_block_t **out_va_block)
{
    uvm_va_block_context_t *va_block_context;
    uvm_va_space_t *va_space;
    struct mm_struct *mm;
    struct vm_area_struct *vma;
    uvm_va_block_region_t region;
    NvU64 addr, from, to;
    uvm_va_block_t *new;
    NV_STATUS status;

    if (va_block->start < start) {
        status = hmm_split_block(va_block, start - 1, &new);
        if (status != NV_OK)
            return status;

        // Keep the left part, the right part will be deleted.
        va_block = new;
    }

    if (va_block->end > end) {
        status = hmm_split_block(va_block, end, NULL);
        if (status != NV_OK)
            return status;

        // Keep the right part, the left part will be deleted.
    }

    *out_va_block = va_block;

    // Migrate any GPU data to sysmem before destroying the HMM va_block.
    // We do this because the new va_range might be for a UVM external
    // allocation which could be converting an address range that was first
    // operated on by UVM-HMM and the external allocation should see that data.
    va_space = va_block->hmm.va_space;
    mm = va_space->va_space_mm.mm;
    va_block_context = uvm_va_space_block_context(va_space, mm);

    for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
        vma = find_vma_intersection(mm, addr, va_block->end);
        if (!vma)
            break;

        from = max(addr, (NvU64)vma->vm_start);
        to = min(va_block->end, (NvU64)vma->vm_end - 1);
        region = uvm_va_block_region_from_start_end(va_block, from, to);

        if (!uvm_hmm_vma_is_valid(vma, from, false))
            continue;

        va_block_context->hmm.vma = vma;

        status = hmm_migrate_range(va_block,
                                   NULL,
                                   va_block_context,
                                   UVM_ID_CPU,
                                   from,
                                   to,
                                   UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
                                   NULL);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

// Normally, the HMM va_block is destroyed when the va_space is destroyed
// (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
// a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
// page tables. However, it doesn't destroy the va_block because that would
// require calling mmu_interval_notifier_remove() which can't be called from
// the invalidate callback due to Linux locking constraints. If a process
// calls mmap()/munmap() for SAM and then creates a managed allocation,
// the same VMA range can be picked and there would be a UVM/HMM va_block
// conflict. Creating a managed allocation, external allocation, or other
// va_range type calls this function to remove stale HMM va_blocks or split
// the HMM va_block so there is no overlap.
NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
                                   struct mm_struct *mm,
                                   NvU64 start,
                                   NvU64 end)
{
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_OK;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    // Process each HMM va_block that overlaps the interval [start, end].
    // Note that end is inclusive.
    // The blocks_lock is not needed when the va_space lock is held for write.
    uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
        va_block = hmm_va_block_from_node(node);

        if (mm) {
            status = split_block_if_needed(va_block, start, end, &va_block);
            if (status != NV_OK)
                return status;
        }

        // Note that this waits for any invalidation callbacks to complete
        // so uvm_hmm_invalidate() won't see a block disappear.
        // The va_space write lock should prevent uvm_hmm_va_block_find_create()
        // from adding it back.
        mmu_interval_notifier_remove(&va_block->hmm.notifier);
        uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
        uvm_va_block_kill(va_block);
    }

    UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));

    return NV_OK;
}

void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
{
    uvm_va_space_t *va_space = existing_va_block->hmm.va_space;

    UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
                         &existing_va_block->hmm.node,
                         &new_block->hmm.node);
}

NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
                                  NvU64 addr,
                                  uvm_va_policy_is_split_needed_t split_needed_cb,
                                  void *data)
{
    uvm_va_block_t *va_block;
    uvm_va_policy_node_t *node;
    NV_STATUS status;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    // If there is no HMM va_block or the va_block doesn't span the policy
    // addr, there is no need to split.
    status = uvm_hmm_va_block_find(va_space, addr, &va_block);
    if (status != NV_OK || va_block->start == addr)
        return NV_OK;

    uvm_mutex_lock(&va_block->lock);

    node = uvm_va_policy_node_find(va_block, addr);
    if (!node)
        goto done;

    // If the policy range doesn't span addr, we're done.
993 if (addr == node->node.start) 994 goto done; 995 996 if (split_needed_cb(&node->policy, data)) 997 status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL); 998 999 done: 1000 uvm_mutex_unlock(&va_block->lock); 1001 return status; 1002 } 1003 1004 static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block, 1005 uvm_va_block_context_t *va_block_context, 1006 uvm_processor_id_t preferred_location, 1007 NvU64 addr, 1008 NvU64 end, 1009 uvm_tracker_t *out_tracker) 1010 { 1011 uvm_processor_mask_t set_accessed_by_processors; 1012 const uvm_va_policy_t *old_policy; 1013 uvm_va_policy_node_t *node; 1014 uvm_va_block_region_t region; 1015 uvm_processor_id_t id; 1016 NV_STATUS status, tracker_status; 1017 1018 // Note that we can't just call uvm_va_policy_set_range() for the whole 1019 // range [addr end] because we need to examine the old value of 1020 // policy->preferred_location before setting it. Thus we iterate over 1021 // the existing policy nodes. 1022 uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) { 1023 if (uvm_id_equal(old_policy->preferred_location, preferred_location)) 1024 continue; 1025 1026 // If the old preferred location is a valid processor ID, remote 1027 // mappings should be established to the new preferred location if 1028 // accessed-by is set. 1029 uvm_processor_mask_zero(&set_accessed_by_processors); 1030 1031 if (UVM_ID_IS_VALID(old_policy->preferred_location) && 1032 uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location)) 1033 uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location); 1034 1035 if (!uvm_va_policy_set_preferred_location(va_block, region, preferred_location, old_policy)) 1036 return NV_ERR_NO_MEMORY; 1037 1038 // Establish new remote mappings if the old preferred location had 1039 // accessed-by set. 1040 for_each_id_in_mask(id, &set_accessed_by_processors) { 1041 status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker); 1042 if (status != NV_OK) 1043 return status; 1044 } 1045 1046 // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock 1047 // the va_block lock, the policy remains valid because we hold the mmap 1048 // lock so munmap can't remove the policy, and the va_space lock so the 1049 // policy APIs can't change the policy. 1050 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, 1051 NULL, 1052 uvm_va_block_set_preferred_location_locked(va_block, 1053 va_block_context, 1054 region)); 1055 1056 tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker); 1057 if (status == NV_OK) 1058 status = tracker_status; 1059 1060 if (status != NV_OK) 1061 return status; 1062 } 1063 1064 return NV_OK; 1065 } 1066 1067 NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space, 1068 uvm_processor_id_t preferred_location, 1069 NvU64 base, 1070 NvU64 last_address, 1071 uvm_tracker_t *out_tracker) 1072 { 1073 uvm_va_block_context_t *va_block_context; 1074 uvm_va_block_t *va_block; 1075 NvU64 addr; 1076 NV_STATUS status = NV_OK; 1077 1078 if (!uvm_hmm_is_enabled(va_space)) 1079 return NV_ERR_INVALID_ADDRESS; 1080 1081 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1082 uvm_assert_rwsem_locked_write(&va_space->lock); 1083 UVM_ASSERT(PAGE_ALIGNED(base)); 1084 UVM_ASSERT(PAGE_ALIGNED(last_address + 1)); 1085 UVM_ASSERT(base < last_address); 1086 1087 // Update HMM preferred location policy. 
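    // The loop below walks [base, last_address] one HMM va_block at a time,
    // creating va_blocks as needed, and applies the new policy while holding
    // each va_block lock.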
1088 1089 va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm); 1090 1091 for (addr = base; addr < last_address; addr = va_block->end + 1) { 1092 NvU64 end; 1093 1094 status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block); 1095 if (status != NV_OK) 1096 break; 1097 1098 end = min(last_address, va_block->end); 1099 1100 uvm_mutex_lock(&va_block->lock); 1101 1102 status = hmm_set_preferred_location_locked(va_block, 1103 va_block_context, 1104 preferred_location, 1105 addr, 1106 end, 1107 out_tracker); 1108 1109 uvm_mutex_unlock(&va_block->lock); 1110 1111 if (status != NV_OK) 1112 break; 1113 } 1114 1115 return status; 1116 } 1117 1118 static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block, 1119 uvm_va_block_context_t *va_block_context, 1120 uvm_processor_id_t processor_id, 1121 NvU64 start, 1122 NvU64 end, 1123 uvm_tracker_t *out_tracker) 1124 { 1125 uvm_va_space_t *va_space = va_block->hmm.va_space; 1126 uvm_va_policy_node_t *node; 1127 uvm_va_block_region_t region; 1128 NV_STATUS status = NV_OK; 1129 1130 uvm_for_each_va_policy_node_in(node, va_block, start, end) { 1131 // Read duplication takes precedence over SetAccessedBy. 1132 // Do not add mappings if read duplication is enabled. 1133 if (uvm_va_policy_is_read_duplicate(&node->policy, va_space)) 1134 continue; 1135 1136 region = uvm_va_block_region_from_start_end(va_block, 1137 max(start, node->node.start), 1138 min(end, node->node.end)); 1139 1140 status = uvm_va_block_set_accessed_by_locked(va_block, 1141 va_block_context, 1142 processor_id, 1143 region, 1144 out_tracker); 1145 if (status != NV_OK) 1146 break; 1147 } 1148 1149 return status; 1150 } 1151 1152 NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space, 1153 uvm_processor_id_t processor_id, 1154 bool set_bit, 1155 NvU64 base, 1156 NvU64 last_address, 1157 uvm_tracker_t *out_tracker) 1158 { 1159 uvm_va_block_context_t *va_block_context; 1160 uvm_va_block_t *va_block; 1161 NvU64 addr; 1162 NV_STATUS status = NV_OK; 1163 1164 if (!uvm_hmm_is_enabled(va_space)) 1165 return NV_ERR_INVALID_ADDRESS; 1166 1167 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1168 uvm_assert_rwsem_locked_write(&va_space->lock); 1169 UVM_ASSERT(PAGE_ALIGNED(base)); 1170 UVM_ASSERT(PAGE_ALIGNED(last_address + 1)); 1171 UVM_ASSERT(base < last_address); 1172 1173 // Update HMM accessed by policy. 
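    // As in uvm_hmm_set_preferred_location(), the range is processed one
    // va_block at a time: uvm_va_policy_set_range() records the accessed-by
    // change (note the inverted !set_bit argument), and only when set_bit is
    // true are mappings then established via
    // hmm_set_accessed_by_start_end_locked(). (This is a summary of the calls
    // below, based on their arguments.)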
1174 1175 va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm); 1176 1177 for (addr = base; addr < last_address; addr = va_block->end + 1) { 1178 NvU64 end; 1179 1180 status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block); 1181 if (status != NV_OK) 1182 break; 1183 1184 end = min(last_address, va_block->end); 1185 1186 uvm_mutex_lock(&va_block->lock); 1187 1188 status = uvm_va_policy_set_range(va_block, 1189 addr, 1190 end, 1191 UVM_VA_POLICY_ACCESSED_BY, 1192 !set_bit, 1193 processor_id, 1194 UVM_READ_DUPLICATION_MAX); 1195 1196 if (status == NV_OK && set_bit) { 1197 status = hmm_set_accessed_by_start_end_locked(va_block, 1198 va_block_context, 1199 processor_id, 1200 addr, 1201 end, 1202 out_tracker); 1203 } 1204 1205 uvm_mutex_unlock(&va_block->lock); 1206 1207 if (status != NV_OK) 1208 break; 1209 } 1210 1211 return status; 1212 } 1213 1214 void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space, 1215 uvm_va_block_t *va_block, 1216 uvm_va_block_context_t *block_context) 1217 { 1218 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 1219 uvm_va_policy_node_t *node; 1220 uvm_va_block_region_t region; 1221 uvm_processor_mask_t map_processors; 1222 uvm_processor_id_t id; 1223 NV_STATUS tracker_status; 1224 NV_STATUS status = NV_OK; 1225 1226 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1227 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1228 uvm_assert_rwsem_locked(&va_space->lock); 1229 1230 uvm_mutex_lock(&va_block->lock); 1231 1232 uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) { 1233 for_each_id_in_mask(id, &node->policy.accessed_by) { 1234 status = hmm_set_accessed_by_start_end_locked(va_block, 1235 block_context, 1236 id, 1237 node->node.start, 1238 node->node.end, 1239 &local_tracker); 1240 if (status != NV_OK) 1241 break; 1242 1243 if (!uvm_va_space_map_remote_on_eviction(va_space)) 1244 continue; 1245 1246 // Exclude the processors that have been already mapped due to 1247 // AccessedBy. 1248 uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by); 1249 1250 for_each_gpu_id_in_mask(id, &map_processors) { 1251 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); 1252 uvm_va_block_gpu_state_t *gpu_state; 1253 1254 if (!gpu->parent->access_counters_supported) 1255 continue; 1256 1257 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 1258 UVM_ASSERT(gpu_state); 1259 1260 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add 1261 // remote mappings to read-duplicated pages. Add support for it 1262 // or create a new function. 1263 status = uvm_va_block_add_mappings(va_block, 1264 block_context, 1265 id, 1266 region, 1267 &gpu_state->evicted, 1268 UvmEventMapRemoteCauseEviction); 1269 tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker); 1270 status = (status == NV_OK) ? tracker_status : status; 1271 if (status != NV_OK) { 1272 UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED); 1273 break; 1274 } 1275 } 1276 } 1277 } 1278 1279 uvm_mutex_unlock(&va_block->lock); 1280 1281 tracker_status = uvm_tracker_wait_deinit(&local_tracker); 1282 status = (status == NV_OK) ? 
tracker_status : status; 1283 if (status != NV_OK) { 1284 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n", 1285 va_block->start, 1286 va_block->end, 1287 nvstatusToString(status)); 1288 } 1289 } 1290 1291 const uvm_va_policy_t *uvm_hmm_find_policy_end(uvm_va_block_t *va_block, 1292 struct vm_area_struct *vma, 1293 unsigned long addr, 1294 NvU64 *endp) 1295 { 1296 const uvm_va_policy_node_t *node; 1297 const uvm_va_policy_t *policy; 1298 NvU64 end = va_block->end; 1299 1300 uvm_assert_mmap_lock_locked(vma->vm_mm); 1301 uvm_assert_mutex_locked(&va_block->lock); 1302 1303 if (end > vma->vm_end - 1) 1304 end = vma->vm_end - 1; 1305 1306 node = uvm_va_policy_node_find(va_block, addr); 1307 if (node) { 1308 policy = &node->policy; 1309 if (end > node->node.end) 1310 end = node->node.end; 1311 } 1312 else { 1313 policy = &uvm_va_policy_default; 1314 } 1315 1316 *endp = end; 1317 1318 return policy; 1319 } 1320 1321 NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block, 1322 struct vm_area_struct **vma_out, 1323 uvm_page_index_t page_index, 1324 const uvm_va_policy_t **policy, 1325 uvm_page_index_t *outerp) 1326 { 1327 unsigned long addr; 1328 NvU64 end; 1329 uvm_page_index_t outer; 1330 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 1331 struct mm_struct *mm = va_space->va_space_mm.mm; 1332 1333 if (!mm) 1334 return NV_ERR_INVALID_ADDRESS; 1335 1336 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1337 uvm_assert_mmap_lock_locked(mm); 1338 uvm_assert_mutex_locked(&va_block->lock); 1339 1340 addr = uvm_va_block_cpu_page_address(va_block, page_index); 1341 1342 *vma_out = vma_lookup(mm, addr); 1343 if (!*vma_out || !((*vma_out)->vm_flags & VM_READ)) 1344 return NV_ERR_INVALID_ADDRESS; 1345 1346 *policy = uvm_hmm_find_policy_end(va_block, *vma_out, addr, &end); 1347 1348 outer = uvm_va_block_cpu_page_index(va_block, end) + 1; 1349 if (*outerp > outer) 1350 *outerp = outer; 1351 1352 return NV_OK; 1353 } 1354 1355 static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block, 1356 uvm_va_block_context_t *block_context) 1357 { 1358 const uvm_va_policy_t *policy; 1359 uvm_va_policy_node_t *node; 1360 uvm_va_block_region_t region; 1361 NV_STATUS status = NV_OK; 1362 1363 uvm_mutex_lock(&va_block->lock); 1364 1365 uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) { 1366 // Unmap may split PTEs and require a retry. Needs to be called 1367 // before the pinned pages information is destroyed. 
1368 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, 1369 NULL, 1370 uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block, 1371 block_context, 1372 region)); 1373 1374 uvm_perf_thrashing_info_destroy(va_block); 1375 1376 if (status != NV_OK) 1377 break; 1378 } 1379 1380 uvm_mutex_unlock(&va_block->lock); 1381 1382 return status; 1383 } 1384 1385 NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space) 1386 { 1387 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 1388 uvm_range_tree_node_t *node, *next; 1389 uvm_va_block_t *va_block; 1390 NV_STATUS status = NV_OK; 1391 1392 if (!uvm_hmm_is_enabled(va_space)) 1393 return NV_OK; 1394 1395 uvm_assert_rwsem_locked_write(&va_space->lock); 1396 1397 uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) { 1398 va_block = hmm_va_block_from_node(node); 1399 1400 status = hmm_clear_thrashing_policy(va_block, block_context); 1401 if (status != NV_OK) 1402 break; 1403 } 1404 1405 return status; 1406 } 1407 1408 uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block, 1409 struct vm_area_struct *vma, 1410 const uvm_va_policy_t *policy, 1411 NvU64 address) 1412 { 1413 NvU64 start, end; 1414 1415 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1416 1417 // We need to limit the prefetch region to the VMA. 1418 start = max(va_block->start, (NvU64)vma->vm_start); 1419 end = min(va_block->end, (NvU64)vma->vm_end - 1); 1420 1421 // Also, we need to limit the prefetch region to the policy range. 1422 if (uvm_va_policy_is_default(policy)) { 1423 NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, 1424 address, 1425 &start, 1426 &end); 1427 // We already know the hole exists and covers the fault region. 1428 UVM_ASSERT(status == NV_OK); 1429 } 1430 else { 1431 const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy); 1432 1433 start = max(start, node->node.start); 1434 end = min(end, node->node.end); 1435 } 1436 1437 return uvm_va_block_region_from_start_end(va_block, start, end); 1438 } 1439 1440 uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block, 1441 struct vm_area_struct *vma, 1442 NvU64 addr) 1443 { 1444 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1445 uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm); 1446 UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end); 1447 1448 if (!(vma->vm_flags & VM_READ)) 1449 return UVM_PROT_NONE; 1450 else if (!(vma->vm_flags & VM_WRITE)) 1451 return UVM_PROT_READ_ONLY; 1452 else 1453 return UVM_PROT_READ_WRITE_ATOMIC; 1454 } 1455 1456 static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block, 1457 uvm_page_index_t page_index, 1458 struct page *page) 1459 { 1460 uvm_cpu_chunk_t *chunk; 1461 NV_STATUS status; 1462 1463 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1464 UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index)); 1465 1466 if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index))) 1467 return NV_ERR_INVALID_ADDRESS; 1468 1469 status = uvm_cpu_chunk_alloc_hmm(page, &chunk); 1470 if (status != NV_OK) 1471 return status; 1472 1473 status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index); 1474 if (status != NV_OK) { 1475 uvm_cpu_chunk_free(chunk); 1476 return status; 1477 } 1478 1479 status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index); 1480 if (status != NV_OK) { 1481 uvm_cpu_chunk_remove_from_block(va_block, page_index); 1482 uvm_cpu_chunk_free(chunk); 1483 } 1484 1485 return status; 1486 } 1487 1488 
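// hmm_va_block_cpu_page_populate() above and hmm_va_block_cpu_page_unpopulate()
// below are used as a pair: populate wraps an HMM-owned struct page in a
// uvm_cpu_chunk_t, inserts it into the block and calls
// uvm_va_block_map_cpu_chunk_on_gpus(), while unpopulate tears that state down
// again. On failure, populate undoes its own partial work, so callers only
// need to handle the returned status.
//
// Illustrative sketch of the pairing (not driver code; the error label is
// hypothetical):
//
//     status = hmm_va_block_cpu_page_populate(va_block, page_index, page);
//     if (status != NV_OK)
//         goto err;    // nothing to undo here
//     ...
//     hmm_va_block_cpu_page_unpopulate(va_block, page_index);    // teardown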
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, 1489 uvm_page_index_t page_index) 1490 { 1491 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 1492 1493 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1494 1495 if (!chunk) 1496 return; 1497 1498 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 1499 !uvm_page_mask_test(&va_block->cpu.resident, page_index)); 1500 1501 uvm_cpu_chunk_remove_from_block(va_block, page_index); 1502 uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index); 1503 uvm_cpu_chunk_free(chunk); 1504 } 1505 1506 static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block, 1507 uvm_page_index_t page_index, 1508 struct page *page) 1509 { 1510 struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index); 1511 1512 UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index))); 1513 return old_page == page; 1514 } 1515 1516 // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the 1517 // service_context masks to match what is being processed. Since a page 1518 // that was expected to be processed isn't migrating, we have to clear the 1519 // masks to make service_context consistent with what is actually being 1520 // handled. 1521 static void clear_service_context_masks(uvm_service_block_context_t *service_context, 1522 uvm_processor_id_t new_residency, 1523 uvm_page_index_t page_index) 1524 { 1525 uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index); 1526 1527 uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, 1528 page_index); 1529 1530 if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency)) 1531 uvm_processor_mask_clear(&service_context->resident_processors, new_residency); 1532 1533 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) 1534 uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index); 1535 1536 if (service_context->thrashing_pin_count > 0 && 1537 uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) { 1538 service_context->thrashing_pin_count--; 1539 } 1540 1541 if (service_context->read_duplicate_count > 0 && 1542 uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) { 1543 service_context->read_duplicate_count--; 1544 } 1545 } 1546 1547 static void cpu_mapping_set(uvm_va_block_t *va_block, 1548 bool is_write, 1549 uvm_page_index_t page_index) 1550 { 1551 uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU); 1552 uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index); 1553 uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index); 1554 if (is_write) 1555 uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1556 else 1557 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1558 } 1559 1560 static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index) 1561 { 1562 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1563 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index); 1564 if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) 1565 uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU); 1566 } 1567 1568 static void gpu_chunk_remove(uvm_va_block_t *va_block, 1569 uvm_page_index_t page_index, 1570 struct page *page) 
{
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_id_t id;

    id = uvm_pmm_devmem_page_to_gpu_id(page);
    gpu_state = uvm_va_block_gpu_state_get(va_block, id);
    UVM_ASSERT(gpu_state);

    gpu_chunk = gpu_state->chunks[page_index];
    if (!gpu_chunk) {
        // If we didn't find a chunk it's because the page was unmapped for
        // mremap and no fault has established a new mapping.
        UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
        return;
    }

    // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks

    uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
    gpu_state->chunks[page_index] = NULL;
}

static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
                               uvm_page_index_t page_index,
                               struct page *page)
{
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_id_t id;
    NV_STATUS status;

    id = uvm_pmm_devmem_page_to_gpu_id(page);
    gpu_state = uvm_va_block_gpu_state_get(va_block, id);

    // It's possible that this is a fresh va_block we're trying to add an
    // existing gpu_chunk to. This occurs for example when a GPU faults on a
    // virtual address that has been remapped with mremap().
    if (!gpu_state) {
        status = uvm_va_block_gpu_state_alloc(va_block);
        if (status != NV_OK)
            return status;
        gpu_state = uvm_va_block_gpu_state_get(va_block, id);
    }

    UVM_ASSERT(gpu_state);

    // Note that a mremap() might be to a CPU virtual address that is no longer
    // aligned with a larger GPU chunk size. We would need to allocate a new
    // aligned GPU chunk and copy from old to new.
    // TODO: Bug 3368756: add support for large GPU pages.
    gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
    UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
    UVM_ASSERT(gpu_chunk->is_referenced);
    UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);

    if (gpu_state->chunks[page_index] == gpu_chunk)
        return NV_OK;

    UVM_ASSERT(!gpu_state->chunks[page_index]);

    // In some configurations such as SR-IOV heavy, the chunk cannot be
    // referenced using its physical address. Create a virtual mapping.
    status = uvm_mmu_chunk_map(gpu_chunk);
    if (status != NV_OK)
        return status;

    // TODO: Bug 3898467: map indirect peers.

    uvm_processor_mask_set(&va_block->resident, id);
    uvm_page_mask_set(&gpu_state->resident, page_index);

    // It is safe to modify the page index field without holding any PMM locks
    // because the chunk is allocated, which means that none of the other
    // fields in the bitmap can change.
    gpu_chunk->va_block = va_block;
    gpu_chunk->va_block_page_index = page_index;

    gpu_state->chunks[page_index] = gpu_chunk;

    return NV_OK;
}

// This is called just before calling migrate_vma_finalize() in order to wait
// for GPU operations to complete and update the va_block state to match which
// pages migrated (or not) and therefore which pages will be released by
// migrate_vma_finalize().
// 'migrated_pages' is the mask of pages that migrated,
// 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
// and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
// 'region' is the page index region of all migrated, non-migrated, and
// same_devmem_page_mask pages.
static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
                                           const unsigned long *src_pfns,
                                           const unsigned long *dst_pfns,
                                           uvm_va_block_region_t region,
                                           const uvm_page_mask_t *migrated_pages,
                                           const uvm_page_mask_t *same_devmem_page_mask)
{
    uvm_page_index_t page_index;
    NV_STATUS status;

    // Wait for the GPU to finish. migrate_vma_finalize() will release the
    // migrated source pages (or non-migrating destination pages), so GPU
    // operations must be finished by then.
    status = uvm_tracker_wait(&va_block->tracker);

    for_each_va_block_page_in_region(page_index, region) {
        struct page *page;

        if (uvm_page_mask_test(same_devmem_page_mask, page_index))
            continue;

        // If a page migrated, clean up the source page.
        // Otherwise, clean up the destination page.
        if (uvm_page_mask_test(migrated_pages, page_index))
            page = migrate_pfn_to_page(src_pfns[page_index]);
        else
            page = migrate_pfn_to_page(dst_pfns[page_index]);

        if (!page)
            continue;

        if (is_device_private_page(page)) {
            gpu_chunk_remove(va_block, page_index, page);
        }
        else {
            // If the source page is a system memory page,
            // migrate_vma_finalize() will release the reference so we should
            // clear our pointer to it.
            // TODO: Bug 3660922: Need to handle read duplication at some point.
            hmm_va_block_cpu_page_unpopulate(va_block, page_index);
        }
    }

    return status;
}

// Update va_block state to reflect that the page isn't migrating.
static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
                                        const unsigned long *src_pfns,
                                        unsigned long *dst_pfns,
                                        uvm_page_index_t page_index)
{
    struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]);

    if (!dst_page)
        return;

    // migrate_vma_finalize() will release the dst_page reference so don't keep
    // a pointer to it.
    if (is_device_private_page(dst_page)) {
        gpu_chunk_remove(va_block, page_index, dst_page);
    }
    else {
        UVM_ASSERT(page_ref_count(dst_page) == 1);

        hmm_va_block_cpu_page_unpopulate(va_block, page_index);
    }

    unlock_page(dst_page);
    put_page(dst_page);
    dst_pfns[page_index] = 0;
}

static void clean_up_non_migrating_pages(uvm_va_block_t *va_block,
                                         const unsigned long *src_pfns,
                                         unsigned long *dst_pfns,
                                         uvm_va_block_region_t region,
                                         uvm_page_mask_t *page_mask)
{
    uvm_page_index_t page_index;
    NV_STATUS status;

    status = uvm_tracker_wait(&va_block->tracker);
    UVM_ASSERT(status == NV_OK);

    for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
        clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index);
    }
}

// CPU page fault handling.

// Fill in the dst_pfns[page_index] entry given that there is an allocated
// CPU page.
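// 'same_devmem_page_mask' is set for pages whose source and destination turn
// out to be the same system memory page (see the non-device-private branch
// below); to migrate_vma_*() such pages look like they are not migrating.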
1757 static void lock_block_cpu_page(uvm_va_block_t *va_block, 1758 uvm_page_index_t page_index, 1759 struct page *src_page, 1760 unsigned long *dst_pfns, 1761 uvm_page_mask_t *same_devmem_page_mask) 1762 { 1763 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 1764 uvm_va_block_region_t chunk_region; 1765 struct page *dst_page; 1766 1767 UVM_ASSERT(chunk); 1768 UVM_ASSERT(chunk->page); 1769 1770 chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index); 1771 1772 dst_page = chunk->page + (page_index - chunk_region.first); 1773 1774 UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index))); 1775 UVM_ASSERT(!is_device_private_page(dst_page)); 1776 1777 // The source page is usually a device private page but it could be a GPU 1778 // remote mapped system memory page. It could also be a driver allocated 1779 // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned 1780 // by the driver). 1781 if (is_device_private_page(src_page)) { 1782 // Since the page isn't mirrored, it was allocated by alloc_pages() 1783 // and UVM owns the reference. We leave the reference count unchanged 1784 // and mark the page pointer as mirrored since UVM is transferring 1785 // ownership to Linux and we don't want UVM to double free the page in 1786 // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page 1787 // does not migrate, it will be freed though. 1788 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 1789 !uvm_page_mask_test(&va_block->cpu.resident, page_index)); 1790 UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL); 1791 UVM_ASSERT(page_ref_count(dst_page) == 1); 1792 uvm_cpu_chunk_make_hmm(chunk); 1793 } 1794 else { 1795 UVM_ASSERT(same_devmem_page_mask); 1796 UVM_ASSERT(src_page == dst_page); 1797 uvm_page_mask_set(same_devmem_page_mask, page_index); 1798 1799 // The call to migrate_vma_setup() will have inserted a migration PTE 1800 // so the CPU has no access. 1801 cpu_mapping_clear(va_block, page_index); 1802 return; 1803 } 1804 1805 lock_page(dst_page); 1806 dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); 1807 } 1808 1809 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block, 1810 uvm_gpu_t *gpu, 1811 uvm_gpu_chunk_t *gpu_chunk) 1812 { 1813 // Tell PMM to expect a callback from Linux to free the page since the 1814 // device private struct page reference count will determine when the 1815 // GPU chunk is free. 
1816 UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); 1817 list_del_init(&gpu_chunk->list); 1818 uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block); 1819 } 1820 1821 static void fill_dst_pfn(uvm_va_block_t *va_block, 1822 uvm_gpu_t *gpu, 1823 const unsigned long *src_pfns, 1824 unsigned long *dst_pfns, 1825 uvm_page_index_t page_index, 1826 uvm_page_mask_t *same_devmem_page_mask) 1827 { 1828 unsigned long src_pfn = src_pfns[page_index]; 1829 uvm_gpu_chunk_t *gpu_chunk; 1830 unsigned long pfn; 1831 struct page *dpage; 1832 1833 gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index)); 1834 UVM_ASSERT(gpu_chunk); 1835 UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT); 1836 pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 1837 1838 // If the same GPU page is both source and destination, migrate_vma_pages() 1839 // will see the wrong "expected" reference count and not migrate it, so we 1840 // mark it as not migrating but we keep track of this so we don't confuse 1841 // it with a page that migrate_vma_pages() actually does not migrate. 1842 if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) { 1843 uvm_page_mask_set(same_devmem_page_mask, page_index); 1844 return; 1845 } 1846 1847 dpage = pfn_to_page(pfn); 1848 UVM_ASSERT(is_device_private_page(dpage)); 1849 UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global); 1850 1851 hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk); 1852 UVM_ASSERT(!page_count(dpage)); 1853 zone_device_page_init(dpage); 1854 dpage->zone_device_data = va_block->hmm.va_space; 1855 1856 dst_pfns[page_index] = migrate_pfn(pfn); 1857 } 1858 1859 static void fill_dst_pfns(uvm_va_block_t *va_block, 1860 const unsigned long *src_pfns, 1861 unsigned long *dst_pfns, 1862 uvm_va_block_region_t region, 1863 uvm_page_mask_t *page_mask, 1864 uvm_page_mask_t *same_devmem_page_mask, 1865 uvm_processor_id_t dest_id) 1866 { 1867 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id); 1868 uvm_page_index_t page_index; 1869 1870 uvm_page_mask_zero(same_devmem_page_mask); 1871 1872 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 1873 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) 1874 continue; 1875 1876 fill_dst_pfn(va_block, 1877 gpu, 1878 src_pfns, 1879 dst_pfns, 1880 page_index, 1881 same_devmem_page_mask); 1882 } 1883 } 1884 1885 static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block, 1886 struct vm_area_struct *vma, 1887 const unsigned long *src_pfns, 1888 unsigned long *dst_pfns, 1889 uvm_va_block_region_t region, 1890 uvm_page_mask_t *page_mask, 1891 uvm_page_mask_t *same_devmem_page_mask, 1892 uvm_processor_id_t processor_id, 1893 uvm_service_block_context_t *service_context) 1894 { 1895 uvm_page_index_t page_index; 1896 NV_STATUS status = NV_OK; 1897 1898 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 1899 struct page *src_page; 1900 struct page *dst_page; 1901 gfp_t gfp; 1902 1903 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 1904 // Device exclusive PTEs are not selected but we still want to 1905 // process the page so record it as such. 
1906             if (service_context && !UVM_ID_IS_CPU(processor_id) &&
1907                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
1908                 uvm_page_mask_set(same_devmem_page_mask, page_index);
1909                 continue;
1910             }
1911
1912             // We have previously found a page that is CPU resident which can't
1913             // be migrated (probably a shared mapping) so make sure we establish
1914             // a remote mapping for it.
1915             if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1916                 continue;
1917
1918             goto clr_mask;
1919         }
1920
1921         // This is the page that will be copied to system memory.
1922         src_page = migrate_pfn_to_page(src_pfns[page_index]);
1923
1924         if (src_page) {
1925             // mremap may have caused us to lose the gpu_chunk associated with
1926             // this va_block/page_index so make sure we have the correct chunk.
1927             if (is_device_private_page(src_page))
1928                 gpu_chunk_add(va_block, page_index, src_page);
1929
1930             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
1931                 lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
1932                 continue;
1933             }
1934         }
1935
1936         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1937                    !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1938
1939         // Allocate a user system memory page for the destination.
1940         // This is the typical case since Linux will free the source page when
1941         // migrating to device private memory.
1942         // If there is no source page, it means the page is pte_none() or the
1943         // zero page. This case "shouldn't happen" because we asked
1944         // migrate_vma_setup() only for device private pages but
1945         // migrate_vma_collect_hole() doesn't check the
1946         // MIGRATE_VMA_SELECT_SYSTEM flag.
1947         gfp = GFP_HIGHUSER_MOVABLE;
1948         if (!src_page)
1949             gfp |= __GFP_ZERO;
1950
1951         dst_page = alloc_page_vma(gfp,
1952                                   vma,
1953                                   va_block->start + (page_index << PAGE_SHIFT));
1954         if (!dst_page) {
1955             // Ignore errors if the page is only for prefetching.
1956             if (service_context &&
1957                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1958                 goto clr_mask;
1959             UVM_ERR_PRINT("cannot allocate page %u (addr 0x%llx)\n",
1960                           page_index, va_block->start + (page_index << PAGE_SHIFT));
1961             status = NV_ERR_NO_MEMORY;
1962             break;
1963         }
1964
1965         status = hmm_va_block_cpu_page_populate(va_block, page_index, dst_page);
1966         if (status != NV_OK) {
1967             __free_page(dst_page);
1968             // Ignore errors if the page is only for prefetching.
1969             if (service_context &&
1970                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1971                 goto clr_mask;
1972             break;
1973         }
1974
1975         // Note that we don't call get_page(dst_page) since alloc_page_vma()
1976         // returns with a page reference count of one and we are passing
1977         // ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded
1978         // the page as "mirrored" so that migrate_vma_finalize() and
1979         // hmm_va_block_cpu_page_unpopulate() don't double free the page.
1980         lock_page(dst_page);
1981         dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
1982         continue;
1983
1984     clr_mask:
1985         // TODO: Bug 3900774: clean up murky mess of mask clearing.
1986 uvm_page_mask_clear(page_mask, page_index); 1987 if (service_context) 1988 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 1989 } 1990 1991 if (status != NV_OK) 1992 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 1993 else if (uvm_page_mask_empty(page_mask)) 1994 return NV_WARN_MORE_PROCESSING_REQUIRED; 1995 1996 return status; 1997 } 1998 1999 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2000 { 2001 uvm_processor_id_t processor_id; 2002 uvm_service_block_context_t *service_context; 2003 uvm_va_block_retry_t *va_block_retry; 2004 const unsigned long *src_pfns; 2005 unsigned long *dst_pfns; 2006 uvm_page_mask_t *page_mask; 2007 uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask; 2008 uvm_va_block_t *va_block; 2009 NV_STATUS status = NV_OK; 2010 2011 processor_id = devmem_fault_context->processor_id; 2012 service_context = devmem_fault_context->service_context; 2013 va_block_retry = devmem_fault_context->va_block_retry; 2014 va_block = devmem_fault_context->va_block; 2015 src_pfns = service_context->block_context.hmm.src_pfns; 2016 dst_pfns = service_context->block_context.hmm.dst_pfns; 2017 2018 // Build the migration page mask. 2019 // Note that thrashing pinned pages and prefetch pages are already 2020 // accounted for in service_context->per_processor_masks. 2021 page_mask = &devmem_fault_context->page_mask; 2022 uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency); 2023 2024 status = alloc_and_copy_to_cpu(va_block, 2025 service_context->block_context.hmm.vma, 2026 src_pfns, 2027 dst_pfns, 2028 service_context->region, 2029 page_mask, 2030 same_devmem_page_mask, 2031 processor_id, 2032 service_context); 2033 if (status != NV_OK) 2034 return status; 2035 2036 // Do the copy but don't update the residency or mapping for the new 2037 // location yet. 2038 return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2039 } 2040 2041 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2042 { 2043 uvm_processor_id_t processor_id; 2044 uvm_service_block_context_t *service_context; 2045 uvm_perf_prefetch_hint_t *prefetch_hint; 2046 uvm_va_block_retry_t *va_block_retry; 2047 const unsigned long *src_pfns; 2048 unsigned long *dst_pfns; 2049 uvm_page_mask_t *page_mask; 2050 uvm_va_block_t *va_block; 2051 uvm_va_block_region_t region; 2052 uvm_page_index_t page_index; 2053 NV_STATUS status, tracker_status; 2054 2055 processor_id = devmem_fault_context->processor_id; 2056 service_context = devmem_fault_context->service_context; 2057 prefetch_hint = &service_context->prefetch_hint; 2058 va_block = devmem_fault_context->va_block; 2059 va_block_retry = devmem_fault_context->va_block_retry; 2060 src_pfns = service_context->block_context.hmm.src_pfns; 2061 dst_pfns = service_context->block_context.hmm.dst_pfns; 2062 region = service_context->region; 2063 2064 page_mask = &devmem_fault_context->page_mask; 2065 2066 // There are a number of reasons why HMM will mark a page as not migrating 2067 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 
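    // (Typical examples, based on the cases handled elsewhere in this file:
    // migrate_vma_setup() could not isolate and lock the page, for instance
    // because it is swapped out or device exclusive, or migrate_vma_pages()
    // saw an unexpected page reference count and refused to migrate it.)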
2068 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2069 if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE) 2070 continue; 2071 2072 // If a page isn't migrating and only the GPU page table is being 2073 // updated, continue to process it normally. 2074 if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index)) 2075 continue; 2076 2077 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2078 uvm_page_mask_clear(page_mask, page_index); 2079 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 2080 } 2081 2082 if (uvm_page_mask_empty(page_mask)) 2083 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2084 else 2085 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2086 2087 tracker_status = sync_page_and_chunk_state(va_block, 2088 src_pfns, 2089 dst_pfns, 2090 region, 2091 page_mask, 2092 &devmem_fault_context->same_devmem_page_mask); 2093 2094 return status == NV_OK ? tracker_status : status; 2095 } 2096 2097 static NV_STATUS populate_region(uvm_va_block_t *va_block, 2098 unsigned long *pfns, 2099 uvm_va_block_region_t region, 2100 uvm_page_mask_t *populated_page_mask) 2101 { 2102 uvm_page_index_t page_index; 2103 NV_STATUS status; 2104 2105 // Make sure GPU state is allocated or else the GPU DMA mappings to 2106 // system memory won't be saved. 2107 status = uvm_va_block_gpu_state_alloc(va_block); 2108 if (status != NV_OK) 2109 return status; 2110 2111 for_each_va_block_page_in_region(page_index, region) { 2112 struct page *page; 2113 2114 // This case should only happen when querying CPU residency and we ask 2115 // for something not covered by a VMA. Otherwise, hmm_range_fault() 2116 // returns -EFAULT instead of setting the HMM_PFN_ERROR bit. 2117 if (pfns[page_index] & HMM_PFN_ERROR) 2118 return NV_ERR_INVALID_ADDRESS; 2119 2120 if (pfns[page_index] & HMM_PFN_VALID) { 2121 page = hmm_pfn_to_page(pfns[page_index]); 2122 } 2123 else { 2124 // The page can't be evicted since it has to be migrated to the GPU 2125 // first which would leave a device private page entry so this has 2126 // to be a pte_none(), swapped out, or similar entry. 2127 // The page would have been allocated if populate_region() is being 2128 // called from uvm_hmm_va_block_service_locked() so this must be 2129 // for uvm_hmm_va_block_update_residency_info(). Just leave the 2130 // residency/populated information unchanged since 2131 // uvm_hmm_invalidate() should handle that if the underlying page 2132 // is invalidated. 2133 // Also note there can be an allocated page due to GPU-to-GPU 2134 // migration between non-peer or indirect peer GPUs. 2135 continue; 2136 } 2137 2138 if (is_device_private_page(page)) { 2139 // Linux can call hmm_invalidate() and we have to clear the GPU 2140 // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might 2141 // not release the device private struct page reference. Since 2142 // hmm_range_fault() did find a device private PTE, we can 2143 // re-establish the GPU chunk pointer. 2144 status = gpu_chunk_add(va_block, page_index, page); 2145 if (status != NV_OK) 2146 return status; 2147 continue; 2148 } 2149 2150 // If a CPU chunk is already allocated, check to see it matches what 2151 // hmm_range_fault() found. 
2152 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2153 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2154 } 2155 else { 2156 status = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2157 if (status != NV_OK) 2158 return status; 2159 2160 // Record that we populated this page. hmm_block_cpu_fault_locked() 2161 // uses this to ensure pages that don't migrate get remote mapped. 2162 if (populated_page_mask) 2163 uvm_page_mask_set(populated_page_mask, page_index); 2164 } 2165 2166 // Since we have a stable snapshot of the CPU pages, we can 2167 // update the residency and protection information. 2168 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2169 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2170 2171 cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index); 2172 } 2173 2174 return NV_OK; 2175 } 2176 2177 static void hmm_range_fault_begin(uvm_va_block_t *va_block) 2178 { 2179 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2180 2181 uvm_assert_mutex_locked(&va_block->lock); 2182 uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed; 2183 } 2184 2185 static bool hmm_range_fault_retry(uvm_va_block_t *va_block) 2186 { 2187 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2188 2189 uvm_assert_mutex_locked(&va_block->lock); 2190 return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed; 2191 } 2192 2193 // Make the region be resident on the CPU by calling hmm_range_fault() to fault 2194 // in CPU pages. 2195 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block, 2196 struct vm_area_struct *vma, 2197 unsigned long *hmm_pfns, 2198 uvm_va_block_region_t region, 2199 NvU8 *access_type, 2200 uvm_page_mask_t *populated_page_mask) 2201 { 2202 uvm_page_index_t page_index; 2203 int ret; 2204 struct hmm_range range = { 2205 .notifier = &va_block->hmm.notifier, 2206 .start = uvm_va_block_region_start(va_block, region), 2207 .end = uvm_va_block_region_end(va_block, region) + 1, 2208 .hmm_pfns = hmm_pfns + region.first, 2209 .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, 2210 .dev_private_owner = &g_uvm_global, 2211 }; 2212 2213 for_each_va_block_page_in_region(page_index, region) { 2214 if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) || 2215 (vma->vm_flags & VM_WRITE)) 2216 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE; 2217 else 2218 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT; 2219 } 2220 2221 hmm_range_fault_begin(va_block); 2222 2223 // Mirror the VA block to the HMM address range. 2224 // Note that we request HMM to handle page faults, which means that it will 2225 // populate and map potentially not-yet-existing pages to the VMA. 2226 // Also note that mmu_interval_read_begin() calls wait_event() for any 2227 // parallel invalidation callbacks to finish so we can't hold locks that 2228 // the invalidation callback acquires. 2229 uvm_mutex_unlock(&va_block->lock); 2230 2231 range.notifier_seq = mmu_interval_read_begin(range.notifier); 2232 ret = hmm_range_fault(&range); 2233 2234 uvm_mutex_lock(&va_block->lock); 2235 2236 if (ret) 2237 return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret); 2238 2239 if (hmm_range_fault_retry(va_block)) 2240 return NV_WARN_MORE_PROCESSING_REQUIRED; 2241 2242 return populate_region(va_block, 2243 hmm_pfns, 2244 region, 2245 populated_page_mask); 2246 } 2247 2248 // Release the reference count on any pages that were made device exclusive. 
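// The pages released here were returned by make_device_exclusive_range(),
// which hands them back locked and with a reference held, so the teardown is
// unlock_page() plus put_page() on every non-NULL entry. A rough sketch of the
// lifetime handled by this file (illustrative only, simplified):
//
//   npages = make_device_exclusive_range(mm, start, end, pages, &g_uvm_global);
//   ... the GPU is granted exclusive (atomic) access via its page tables ...
//   hmm_release_atomic_pages(va_block, service_context);  // unlock + put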
2249 static void hmm_release_atomic_pages(uvm_va_block_t *va_block, 2250 uvm_service_block_context_t *service_context) 2251 { 2252 uvm_va_block_region_t region = service_context->region; 2253 uvm_page_index_t page_index; 2254 2255 for_each_va_block_page_in_region(page_index, region) { 2256 struct page *page = service_context->block_context.hmm.pages[page_index]; 2257 2258 if (!page) 2259 continue; 2260 2261 unlock_page(page); 2262 put_page(page); 2263 } 2264 } 2265 2266 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, 2267 uvm_va_block_t *va_block, 2268 uvm_va_block_retry_t *va_block_retry, 2269 uvm_service_block_context_t *service_context) 2270 { 2271 uvm_va_block_region_t region = service_context->region; 2272 struct page **pages = service_context->block_context.hmm.pages; 2273 int npages; 2274 uvm_page_index_t page_index; 2275 uvm_make_resident_cause_t cause; 2276 NV_STATUS status; 2277 2278 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2279 !uvm_page_mask_region_full(&va_block->cpu.resident, region)) { 2280 // There is an atomic GPU fault. We need to make sure no pages are 2281 // GPU resident so that make_device_exclusive_range() doesn't call 2282 // migrate_to_ram() and cause a va_space lock recursion problem. 2283 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS) 2284 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 2285 else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS) 2286 cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 2287 else 2288 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 2289 2290 status = uvm_hmm_va_block_migrate_locked(va_block, 2291 va_block_retry, 2292 &service_context->block_context, 2293 UVM_ID_CPU, 2294 region, 2295 cause); 2296 if (status != NV_OK) 2297 goto done; 2298 2299 // make_device_exclusive_range() will try to call migrate_to_ram() 2300 // and deadlock with ourself if the data isn't CPU resident. 2301 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2302 !uvm_page_mask_region_full(&va_block->cpu.resident, region)) { 2303 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2304 goto done; 2305 } 2306 } 2307 2308 // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED 2309 // mmap() files so we check for that here and report a fatal fault. 2310 // Otherwise with the current Linux 6.1 make_device_exclusive_range(), 2311 // it doesn't make the page exclusive and we end up in an endless loop. 2312 if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) { 2313 status = NV_ERR_NOT_SUPPORTED; 2314 goto done; 2315 } 2316 2317 hmm_range_fault_begin(va_block); 2318 2319 uvm_mutex_unlock(&va_block->lock); 2320 2321 npages = make_device_exclusive_range(service_context->block_context.mm, 2322 uvm_va_block_cpu_page_address(va_block, region.first), 2323 uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE, 2324 pages + region.first, 2325 &g_uvm_global); 2326 2327 uvm_mutex_lock(&va_block->lock); 2328 2329 if (npages < 0) { 2330 status = (npages == -EBUSY) ? 
NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages); 2331 goto done; 2332 } 2333 2334 while ((size_t)npages < uvm_va_block_region_num_pages(region)) 2335 pages[region.first + npages++] = NULL; 2336 2337 if (hmm_range_fault_retry(va_block)) { 2338 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2339 goto release; 2340 } 2341 2342 status = NV_OK; 2343 2344 for_each_va_block_page_in_region(page_index, region) { 2345 struct page *page = pages[page_index]; 2346 2347 if (!page) { 2348 // Record that one of the pages isn't exclusive but keep converting 2349 // the others. 2350 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2351 continue; 2352 } 2353 2354 // If a CPU chunk is already allocated, check to see it matches what 2355 // make_device_exclusive_range() found. 2356 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2357 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2358 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2359 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index)); 2360 } 2361 else { 2362 NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2363 2364 if (s == NV_OK) { 2365 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2366 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2367 } 2368 } 2369 2370 cpu_mapping_clear(va_block, page_index); 2371 } 2372 2373 if (status != NV_OK) 2374 goto release; 2375 2376 status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2377 if (status != NV_OK) 2378 goto release; 2379 2380 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2381 2382 release: 2383 hmm_release_atomic_pages(va_block, service_context); 2384 2385 done: 2386 return status; 2387 } 2388 2389 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region) 2390 { 2391 uvm_page_index_t page_index; 2392 2393 for_each_va_block_page_in_region(page_index, region) { 2394 if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) 2395 return true; 2396 } 2397 2398 return false; 2399 } 2400 2401 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region) 2402 { 2403 uvm_processor_id_t gpu_id; 2404 2405 for_each_gpu_id_in_mask(gpu_id, &va_block->resident) { 2406 uvm_va_block_gpu_state_t *gpu_state; 2407 2408 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 2409 if (!uvm_page_mask_region_empty(&gpu_state->resident, region)) 2410 return true; 2411 } 2412 2413 return false; 2414 } 2415 2416 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, 2417 uvm_va_block_t *va_block, 2418 uvm_va_block_retry_t *va_block_retry, 2419 uvm_service_block_context_t *service_context) 2420 { 2421 uvm_va_block_region_t region = service_context->region; 2422 struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args; 2423 NV_STATUS status; 2424 int ret; 2425 uvm_hmm_devmem_fault_context_t fault_context = { 2426 .processor_id = processor_id, 2427 .va_block = va_block, 2428 .va_block_retry = va_block_retry, 2429 .service_context = service_context, 2430 }; 2431 2432 // Normally the source page will be a device private page that is being 2433 // migrated to system memory. However, when it is a GPU fault, the source 2434 // page can be a system memory page that the GPU needs to remote map 2435 // instead. 
However migrate_vma_setup() won't select these types of 2436 // mappings/pages: 2437 // - device exclusive PTEs 2438 // - shared mappings 2439 // - file backed mappings 2440 // Also, if the source and destination page are the same, the page reference 2441 // count won't be the "expected" count and migrate_vma_pages() won't migrate 2442 // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and 2443 // uvm_hmm_devmem_fault_finalize_and_map() still needs to process these 2444 // pages even if src_pfn indicates they are not migrating. 2445 uvm_page_mask_zero(&fault_context.same_devmem_page_mask); 2446 2447 if (!UVM_ID_IS_CPU(processor_id)) { 2448 if (is_atomic_fault(service_context->access_type, region)) { 2449 return hmm_block_atomic_fault_locked(processor_id, 2450 va_block, 2451 va_block_retry, 2452 service_context); 2453 } 2454 2455 status = hmm_make_resident_cpu(va_block, 2456 service_context->block_context.hmm.vma, 2457 service_context->block_context.hmm.src_pfns, 2458 region, 2459 service_context->access_type, 2460 &fault_context.same_devmem_page_mask); 2461 if (status != NV_OK) 2462 return status; 2463 2464 // If no GPU has a resident copy, we can skip the migrate_vma_*(). 2465 // This is necessary if uvm_hmm_must_use_sysmem() returned true. 2466 if (!is_gpu_resident(va_block, region)) { 2467 status = uvm_va_block_service_copy(processor_id, 2468 UVM_ID_CPU, 2469 va_block, 2470 va_block_retry, 2471 service_context); 2472 if (status != NV_OK) 2473 return status; 2474 2475 return uvm_va_block_service_finish(processor_id, va_block, service_context); 2476 } 2477 } 2478 2479 args->vma = service_context->block_context.hmm.vma; 2480 args->src = service_context->block_context.hmm.src_pfns + region.first; 2481 args->dst = service_context->block_context.hmm.dst_pfns + region.first; 2482 args->start = uvm_va_block_region_start(va_block, region); 2483 args->end = uvm_va_block_region_end(va_block, region) + 1; 2484 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 2485 args->pgmap_owner = &g_uvm_global; 2486 2487 if (UVM_ID_IS_CPU(processor_id)) { 2488 args->fault_page = service_context->cpu_fault.vmf->page; 2489 } 2490 else { 2491 args->flags |= MIGRATE_VMA_SELECT_SYSTEM; 2492 args->fault_page = NULL; 2493 } 2494 2495 ret = migrate_vma_setup_locked(args, va_block); 2496 UVM_ASSERT(!ret); 2497 2498 // The overall process here is to migrate pages from the GPU to the CPU 2499 // and possibly remote map the GPU to sysmem if accessed_by is set. 2500 // This is safe because we hold the va_block lock across the calls to 2501 // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(), 2502 // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize(). 2503 // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block 2504 // lock, a sequence number is used to tell if an invalidate() callback 2505 // occurred while not holding the lock. If the sequence number changes, 2506 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2507 // uvm_va_block_service_locked() called again. Otherwise, there were no 2508 // conflicting invalidate callbacks and our snapshots of the CPU page 2509 // tables are accurate and can be used to DMA pages and update GPU page 2510 // tables. 
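    //
    // As a rough outline (an illustrative summary of the calls below, with
    // migrate_vma_setup_locked() already done above):
    //
    //   uvm_hmm_devmem_fault_alloc_and_copy(&fault_context);   // allocate CPU pages, start copies
    //   migrate_vma_pages(args);                                // kernel commits the page replacement
    //   uvm_hmm_devmem_fault_finalize_and_map(&fault_context);  // update residency and mappings
    //   migrate_vma_finalize(args);                             // release source/destination pages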
2511 status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context); 2512 if (status == NV_OK) { 2513 migrate_vma_pages(args); 2514 status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context); 2515 } 2516 2517 migrate_vma_finalize(args); 2518 2519 if (status == NV_WARN_NOTHING_TO_DO) 2520 status = NV_OK; 2521 2522 return status; 2523 } 2524 2525 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, 2526 struct vm_area_struct *vma, 2527 const unsigned long *src_pfns, 2528 unsigned long *dst_pfns, 2529 uvm_va_block_region_t region, 2530 uvm_page_mask_t *page_mask, 2531 uvm_processor_id_t dest_id, 2532 uvm_service_block_context_t *service_context) 2533 { 2534 uvm_page_index_t page_index; 2535 NV_STATUS status = NV_OK; 2536 2537 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2538 struct page *src_page; 2539 2540 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 2541 // HMM currently has some limitations on what pages can be migrated. 2542 // For example, no file backed pages, device private pages owned by 2543 // a different device, device exclusive or swapped out pages. 2544 goto clr_mask; 2545 } 2546 2547 // This is the page that will be copied to the destination GPU. 2548 src_page = migrate_pfn_to_page(src_pfns[page_index]); 2549 if (src_page) { 2550 if (is_device_private_page(src_page)) { 2551 status = gpu_chunk_add(va_block, page_index, src_page); 2552 if (status != NV_OK) 2553 break; 2554 continue; 2555 } 2556 2557 if (PageSwapCache(src_page)) { 2558 // TODO: Bug 4050579: Remove this when swap cached pages can be 2559 // migrated. 2560 if (service_context) { 2561 service_context->block_context.hmm.swap_cached = true; 2562 break; 2563 } 2564 2565 goto clr_mask; 2566 } 2567 2568 // If the page is already allocated, it is most likely a mirrored 2569 // page. Check to be sure it matches what we have recorded. The 2570 // page shouldn't be a staging page from a GPU to GPU migration 2571 // or a remote mapped atomic sysmem page because migrate_vma_setup() 2572 // found a normal page and non-mirrored pages are only known 2573 // privately to the UVM driver. 2574 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2575 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page)); 2576 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2577 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index)); 2578 } 2579 else { 2580 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page); 2581 if (status != NV_OK) 2582 goto clr_mask; 2583 2584 // Since there is a CPU resident page, there shouldn't be one 2585 // anywhere else. TODO: Bug 3660922: Need to handle read 2586 // duplication at some point. 2587 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2588 2589 // migrate_vma_setup() was able to isolate and lock the page; 2590 // therefore, it is CPU resident and not mapped. 2591 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2592 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2593 } 2594 2595 // The call to migrate_vma_setup() will have inserted a migration 2596 // PTE so the CPU has no access. 2597 cpu_mapping_clear(va_block, page_index); 2598 } 2599 else { 2600 // It is OK to migrate an empty anonymous page, a zero page will 2601 // be allocated on the GPU. Just be sure to free any pages 2602 // used for GPU to GPU copies. It can't be an evicted page because 2603 // migrate_vma_setup() would have found a source page. 
2604 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2605 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2606 2607 hmm_va_block_cpu_page_unpopulate(va_block, page_index); 2608 } 2609 } 2610 2611 continue; 2612 2613 clr_mask: 2614 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2615 uvm_page_mask_clear(page_mask, page_index); 2616 if (service_context) 2617 clear_service_context_masks(service_context, dest_id, page_index); 2618 } 2619 2620 if (uvm_page_mask_empty(page_mask) || 2621 (service_context && service_context->block_context.hmm.swap_cached)) 2622 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2623 2624 if (status != NV_OK) 2625 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2626 2627 return status; 2628 } 2629 2630 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, 2631 uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2632 { 2633 uvm_processor_id_t processor_id; 2634 uvm_processor_id_t new_residency; 2635 uvm_va_block_t *va_block; 2636 uvm_va_block_retry_t *va_block_retry; 2637 uvm_service_block_context_t *service_context; 2638 uvm_perf_prefetch_hint_t *prefetch_hint; 2639 const unsigned long *src_pfns; 2640 unsigned long *dst_pfns; 2641 uvm_va_block_region_t region; 2642 uvm_page_mask_t *page_mask; 2643 NV_STATUS status; 2644 2645 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2646 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2647 va_block = uvm_hmm_gpu_fault_event->va_block; 2648 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2649 service_context = uvm_hmm_gpu_fault_event->service_context; 2650 region = service_context->region; 2651 prefetch_hint = &service_context->prefetch_hint; 2652 src_pfns = service_context->block_context.hmm.src_pfns; 2653 dst_pfns = service_context->block_context.hmm.dst_pfns; 2654 2655 // Build the migration mask. 2656 // Note that thrashing pinned pages are already accounted for in 2657 // service_context->resident_processors. 2658 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2659 uvm_page_mask_copy(page_mask, 2660 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 2661 2662 status = dmamap_src_sysmem_pages(va_block, 2663 vma, 2664 src_pfns, 2665 dst_pfns, 2666 region, 2667 page_mask, 2668 new_residency, 2669 service_context); 2670 if (status != NV_OK) 2671 return status; 2672 2673 // Do the alloc and copy but don't update the residency or mapping for the 2674 // new location yet. 2675 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context); 2676 if (status != NV_OK) 2677 return status; 2678 2679 // Record the destination PFNs of device private struct pages now that 2680 // uvm_va_block_service_copy() has populated the GPU destination pages. 
2681 fill_dst_pfns(va_block, 2682 src_pfns, 2683 dst_pfns, 2684 region, 2685 page_mask, 2686 &uvm_hmm_gpu_fault_event->same_devmem_page_mask, 2687 new_residency); 2688 2689 return status; 2690 } 2691 2692 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2693 { 2694 uvm_processor_id_t processor_id; 2695 uvm_processor_id_t new_residency; 2696 uvm_va_block_t *va_block; 2697 uvm_va_block_retry_t *va_block_retry; 2698 uvm_service_block_context_t *service_context; 2699 const unsigned long *src_pfns; 2700 unsigned long *dst_pfns; 2701 uvm_va_block_region_t region; 2702 uvm_page_index_t page_index; 2703 uvm_page_mask_t *page_mask; 2704 NV_STATUS status, tracker_status; 2705 2706 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2707 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2708 va_block = uvm_hmm_gpu_fault_event->va_block; 2709 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2710 service_context = uvm_hmm_gpu_fault_event->service_context; 2711 src_pfns = service_context->block_context.hmm.src_pfns; 2712 dst_pfns = service_context->block_context.hmm.dst_pfns; 2713 region = service_context->region; 2714 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2715 2716 // There are a number of reasons why HMM will mark a page as not migrating 2717 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2718 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2719 unsigned long src_pfn = src_pfns[page_index]; 2720 2721 if (src_pfn & MIGRATE_PFN_MIGRATE) 2722 continue; 2723 2724 // If a device private page isn't migrating and only the GPU page table 2725 // is being updated, continue to process it normally. 2726 if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index)) 2727 continue; 2728 2729 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2730 uvm_page_mask_clear(page_mask, page_index); 2731 clear_service_context_masks(service_context, new_residency, page_index); 2732 } 2733 2734 if (uvm_page_mask_empty(page_mask)) 2735 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2736 else 2737 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2738 2739 tracker_status = sync_page_and_chunk_state(va_block, 2740 src_pfns, 2741 dst_pfns, 2742 region, 2743 page_mask, 2744 &uvm_hmm_gpu_fault_event->same_devmem_page_mask); 2745 2746 return status == NV_OK ? tracker_status : status; 2747 } 2748 2749 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, 2750 uvm_processor_id_t new_residency, 2751 uvm_va_block_t *va_block, 2752 uvm_va_block_retry_t *va_block_retry, 2753 uvm_service_block_context_t *service_context) 2754 { 2755 struct mm_struct *mm = service_context->block_context.mm; 2756 struct vm_area_struct *vma = service_context->block_context.hmm.vma; 2757 uvm_va_block_region_t region = service_context->region; 2758 uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event; 2759 struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args; 2760 int ret; 2761 NV_STATUS status = NV_ERR_INVALID_ADDRESS; 2762 2763 if (!mm) 2764 return status; 2765 2766 uvm_assert_mmap_lock_locked(mm); 2767 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 2768 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 2769 uvm_assert_mutex_locked(&va_block->lock); 2770 UVM_ASSERT(vma); 2771 2772 // If the desired destination is the CPU, try to fault in CPU pages. 
2773 if (UVM_ID_IS_CPU(new_residency)) 2774 return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context); 2775 2776 uvm_hmm_gpu_fault_event.processor_id = processor_id; 2777 uvm_hmm_gpu_fault_event.new_residency = new_residency; 2778 uvm_hmm_gpu_fault_event.va_block = va_block; 2779 uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry; 2780 uvm_hmm_gpu_fault_event.service_context = service_context; 2781 2782 args->vma = vma; 2783 args->src = service_context->block_context.hmm.src_pfns + region.first; 2784 args->dst = service_context->block_context.hmm.dst_pfns + region.first; 2785 args->start = uvm_va_block_region_start(va_block, region); 2786 args->end = uvm_va_block_region_end(va_block, region) + 1; 2787 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 2788 args->pgmap_owner = &g_uvm_global; 2789 args->fault_page = NULL; 2790 2791 ret = migrate_vma_setup_locked(args, va_block); 2792 UVM_ASSERT(!ret); 2793 2794 // The overall process here is to migrate pages from the CPU or GPUs to the 2795 // faulting GPU. 2796 // This is safe because we hold the va_block lock across the calls to 2797 // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(), 2798 // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize(). 2799 // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block 2800 // lock, a sequence number is used to tell if an invalidate() callback 2801 // occurred while not holding the lock. If the sequence number changes, 2802 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2803 // uvm_va_block_service_locked() called again. Otherwise, there were no 2804 // conflicting invalidate callbacks and our snapshots of the CPU page 2805 // tables are accurate and can be used to DMA pages and update GPU page 2806 // tables. TODO: Bug 3901904: there might be better ways of handling no 2807 // page being migrated. 2808 status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event); 2809 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 2810 migrate_vma_finalize(args); 2811 2812 // migrate_vma_setup() might have not been able to lock/isolate any 2813 // pages because they are swapped out or are device exclusive. 2814 // We do know that none of the pages in the region are zero pages 2815 // since migrate_vma_setup() would have reported that information. 2816 // Try to make it resident in system memory and retry the migration. 
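        // Returning NV_WARN_MORE_PROCESSING_REQUIRED tells the caller to drop
        // the locks (mm, va_space, va_block) and call
        // uvm_va_block_service_locked() again, as described in the comment
        // above; by then the pages faulted in below should be resident in
        // system memory and selectable by migrate_vma_setup().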
2817 status = hmm_make_resident_cpu(va_block, 2818 service_context->block_context.hmm.vma, 2819 service_context->block_context.hmm.src_pfns, 2820 region, 2821 service_context->access_type, 2822 NULL); 2823 return NV_WARN_MORE_PROCESSING_REQUIRED; 2824 } 2825 2826 if (status == NV_OK) { 2827 migrate_vma_pages(args); 2828 status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event); 2829 } 2830 2831 migrate_vma_finalize(args); 2832 2833 if (status == NV_WARN_NOTHING_TO_DO) 2834 status = NV_OK; 2835 2836 return status; 2837 } 2838 2839 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, 2840 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 2841 { 2842 uvm_va_block_t *va_block; 2843 uvm_va_block_retry_t *va_block_retry; 2844 uvm_va_block_context_t *va_block_context; 2845 const unsigned long *src_pfns; 2846 unsigned long *dst_pfns; 2847 uvm_va_block_region_t region; 2848 uvm_processor_id_t dest_id; 2849 uvm_page_mask_t *page_mask; 2850 NV_STATUS status; 2851 2852 va_block = uvm_hmm_migrate_event->va_block; 2853 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 2854 va_block_context = uvm_hmm_migrate_event->va_block_context; 2855 src_pfns = va_block_context->hmm.src_pfns; 2856 dst_pfns = va_block_context->hmm.dst_pfns; 2857 region = uvm_hmm_migrate_event->region; 2858 dest_id = uvm_hmm_migrate_event->dest_id; 2859 page_mask = &uvm_hmm_migrate_event->page_mask; 2860 uvm_page_mask_init_from_region(page_mask, region, NULL); 2861 uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask); 2862 2863 uvm_assert_mutex_locked(&va_block->lock); 2864 2865 if (UVM_ID_IS_CPU(dest_id)) { 2866 status = alloc_and_copy_to_cpu(va_block, 2867 vma, 2868 src_pfns, 2869 dst_pfns, 2870 region, 2871 page_mask, 2872 &uvm_hmm_migrate_event->same_devmem_page_mask, 2873 UVM_ID_INVALID, 2874 NULL); 2875 } 2876 else { 2877 status = dmamap_src_sysmem_pages(va_block, 2878 vma, 2879 src_pfns, 2880 dst_pfns, 2881 region, 2882 page_mask, 2883 dest_id, 2884 NULL); 2885 } 2886 if (status != NV_OK) 2887 return status; 2888 2889 status = uvm_va_block_make_resident_copy(va_block, 2890 va_block_retry, 2891 va_block_context, 2892 dest_id, 2893 region, 2894 page_mask, 2895 NULL, 2896 uvm_hmm_migrate_event->cause); 2897 if (status != NV_OK) 2898 return status; 2899 2900 if (!UVM_ID_IS_CPU(dest_id)) { 2901 // Record the destination PFNs of device private struct pages now that 2902 // uvm_va_block_make_resident_copy() has populated the GPU destination 2903 // pages. 
2904 fill_dst_pfns(va_block, 2905 src_pfns, 2906 dst_pfns, 2907 region, 2908 page_mask, 2909 &uvm_hmm_migrate_event->same_devmem_page_mask, 2910 dest_id); 2911 } 2912 2913 return status; 2914 } 2915 2916 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 2917 { 2918 uvm_va_block_t *va_block; 2919 uvm_va_block_retry_t *va_block_retry; 2920 uvm_va_block_context_t *va_block_context; 2921 uvm_va_block_region_t region; 2922 uvm_processor_id_t dest_id; 2923 uvm_page_index_t page_index; 2924 uvm_page_mask_t *page_mask; 2925 const unsigned long *src_pfns; 2926 unsigned long *dst_pfns; 2927 2928 va_block = uvm_hmm_migrate_event->va_block; 2929 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 2930 va_block_context = uvm_hmm_migrate_event->va_block_context; 2931 region = uvm_hmm_migrate_event->region; 2932 dest_id = uvm_hmm_migrate_event->dest_id; 2933 page_mask = &uvm_hmm_migrate_event->page_mask; 2934 src_pfns = va_block_context->hmm.src_pfns; 2935 dst_pfns = va_block_context->hmm.dst_pfns; 2936 2937 uvm_assert_mutex_locked(&va_block->lock); 2938 2939 // There are a number of reasons why HMM will mark a page as not migrating 2940 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2941 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2942 unsigned long src_pfn = src_pfns[page_index]; 2943 2944 if (src_pfn & MIGRATE_PFN_MIGRATE) 2945 continue; 2946 2947 // If a device private page isn't migrating and only the GPU page table 2948 // is being updated, continue to process it normally. 2949 if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index)) 2950 continue; 2951 2952 uvm_page_mask_clear(page_mask, page_index); 2953 } 2954 2955 uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask); 2956 2957 return sync_page_and_chunk_state(va_block, 2958 src_pfns, 2959 dst_pfns, 2960 region, 2961 page_mask, 2962 &uvm_hmm_migrate_event->same_devmem_page_mask); 2963 } 2964 2965 static bool is_resident(uvm_va_block_t *va_block, 2966 uvm_processor_id_t dest_id, 2967 uvm_va_block_region_t region) 2968 { 2969 if (!uvm_processor_mask_test(&va_block->resident, dest_id)) 2970 return false; 2971 2972 return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region); 2973 } 2974 2975 // Note that migrate_vma_*() doesn't handle asynchronous migrations so the 2976 // migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect. 2977 // TODO: Bug 3900785: investigate ways to implement async migration. 
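// Callers are expected to hold the locks checked by the asserts at the top of
// this function (the mm's mmap_lock, the va_space lock, the va_block's
// hmm.migrate_lock and the va_block lock) and to have set
// va_block_context->hmm.vma to the VMA covering 'region'. A simplified,
// illustrative call sequence (real call sites also handle va_block_retry and
// NV_WARN_MORE_PROCESSING_REQUIRED):
//
//   va_block_context->hmm.vma = vma;
//   uvm_hmm_migrate_begin_wait(va_block);
//   uvm_mutex_lock(&va_block->lock);
//   status = uvm_hmm_va_block_migrate_locked(va_block, &va_block_retry,
//                                            va_block_context, dest_id,
//                                            region, cause);
//   uvm_mutex_unlock(&va_block->lock);
//   uvm_hmm_migrate_finish(va_block);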
2978 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, 2979 uvm_va_block_retry_t *va_block_retry, 2980 uvm_va_block_context_t *va_block_context, 2981 uvm_processor_id_t dest_id, 2982 uvm_va_block_region_t region, 2983 uvm_make_resident_cause_t cause) 2984 { 2985 uvm_hmm_migrate_event_t uvm_hmm_migrate_event; 2986 struct vm_area_struct *vma = va_block_context->hmm.vma; 2987 NvU64 start; 2988 NvU64 end; 2989 struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args; 2990 NV_STATUS status; 2991 int ret; 2992 2993 UVM_ASSERT(vma); 2994 UVM_ASSERT(va_block_context->mm == vma->vm_mm); 2995 uvm_assert_mmap_lock_locked(va_block_context->mm); 2996 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 2997 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 2998 uvm_assert_mutex_locked(&va_block->lock); 2999 3000 start = uvm_va_block_region_start(va_block, region); 3001 end = uvm_va_block_region_end(va_block, region); 3002 UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end); 3003 3004 uvm_hmm_migrate_event.va_block = va_block; 3005 uvm_hmm_migrate_event.va_block_retry = va_block_retry; 3006 uvm_hmm_migrate_event.va_block_context = va_block_context; 3007 uvm_hmm_migrate_event.region = region; 3008 uvm_hmm_migrate_event.dest_id = dest_id; 3009 uvm_hmm_migrate_event.cause = cause; 3010 3011 args->vma = vma; 3012 args->src = va_block_context->hmm.src_pfns + region.first; 3013 args->dst = va_block_context->hmm.dst_pfns + region.first; 3014 args->start = uvm_va_block_region_start(va_block, region); 3015 args->end = uvm_va_block_region_end(va_block, region) + 1; 3016 args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE : 3017 MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 3018 args->pgmap_owner = &g_uvm_global; 3019 args->fault_page = NULL; 3020 3021 // Note that migrate_vma_setup() doesn't handle file backed or VM_SPECIAL 3022 // VMAs so if UvmMigrate() tries to migrate such a region, -EINVAL will 3023 // be returned and we will only try to make the pages be CPU resident. 3024 ret = migrate_vma_setup_locked(args, va_block); 3025 if (ret) 3026 return hmm_make_resident_cpu(va_block, 3027 vma, 3028 va_block_context->hmm.src_pfns, 3029 region, 3030 NULL, 3031 NULL); 3032 3033 // The overall process here is to migrate pages from the CPU or GPUs to the 3034 // destination processor. Note that block_migrate_add_mappings() handles 3035 // updating GPU mappings after the migration. 3036 // This is safe because we hold the va_block lock across the calls to 3037 // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(), 3038 // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and 3039 // block_migrate_add_mappings(). 3040 // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block 3041 // lock, a sequence number is used to tell if an invalidate() callback 3042 // occurred while not holding the lock. If the sequence number changes, 3043 // all the locks need to be dropped (mm, va_space, va_block) and the whole 3044 // uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no 3045 // conflicting invalidate callbacks and our snapshots of the CPU page 3046 // tables are accurate and can be used to DMA pages and update GPU page 3047 // tables. 
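    //
    // A sequence number scheme of this kind is implemented by
    // hmm_range_fault_begin()/hmm_range_fault_retry() earlier in this file;
    // conceptually (illustrative only):
    //
    //   seq = va_block->hmm.changed;           // hmm_range_fault_begin()
    //   ... drop the va_block lock, call into the kernel, retake the lock ...
    //   if (seq != va_block->hmm.changed)      // hmm_range_fault_retry()
    //       return NV_WARN_MORE_PROCESSING_REQUIRED;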
3048 status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event); 3049 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 3050 uvm_processor_id_t id; 3051 uvm_page_mask_t *page_mask; 3052 3053 migrate_vma_finalize(args); 3054 3055 // The CPU pages tables might contain only device private pages or 3056 // the migrate_vma_setup() might have not been able to lock/isolate 3057 // any pages because they are swapped out, or on another device. 3058 // We do know that none of the pages in the region are zero pages 3059 // since migrate_vma_setup() would have reported that information. 3060 // Collect all the pages that need to be faulted in and made CPU 3061 // resident, then do the hmm_range_fault() and retry. 3062 page_mask = &va_block_context->caller_page_mask; 3063 uvm_page_mask_init_from_region(page_mask, region, NULL); 3064 3065 for_each_id_in_mask(id, &va_block->resident) { 3066 if (!uvm_page_mask_andnot(page_mask, 3067 page_mask, 3068 uvm_va_block_resident_mask_get(va_block, id))) 3069 return NV_OK; 3070 } 3071 3072 return hmm_make_resident_cpu(va_block, 3073 vma, 3074 va_block_context->hmm.src_pfns, 3075 region, 3076 NULL, 3077 NULL); 3078 } 3079 3080 if (status == NV_OK) { 3081 migrate_vma_pages(args); 3082 status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event); 3083 } 3084 3085 migrate_vma_finalize(args); 3086 3087 if (status == NV_WARN_NOTHING_TO_DO) 3088 status = NV_OK; 3089 3090 return status; 3091 } 3092 3093 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, 3094 uvm_va_block_context_t *va_block_context, 3095 NvU64 base, 3096 NvU64 length, 3097 uvm_processor_id_t dest_id, 3098 uvm_migrate_mode_t mode, 3099 uvm_tracker_t *out_tracker) 3100 { 3101 struct mm_struct *mm; 3102 uvm_va_block_t *va_block; 3103 uvm_va_block_retry_t va_block_retry; 3104 NvU64 addr, end, last_address; 3105 NV_STATUS status = NV_OK; 3106 3107 if (!uvm_hmm_is_enabled(va_space)) 3108 return NV_ERR_INVALID_ADDRESS; 3109 3110 mm = va_block_context->mm; 3111 UVM_ASSERT(mm == va_space->va_space_mm.mm); 3112 uvm_assert_mmap_lock_locked(mm); 3113 uvm_assert_rwsem_locked(&va_space->lock); 3114 3115 last_address = base + length - 1; 3116 3117 for (addr = base; addr < last_address; addr = end + 1) { 3118 struct vm_area_struct *vma; 3119 3120 status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block); 3121 if (status != NV_OK) 3122 return status; 3123 3124 end = va_block->end; 3125 if (end > last_address) 3126 end = last_address; 3127 3128 vma = va_block_context->hmm.vma; 3129 if (end > vma->vm_end - 1) 3130 end = vma->vm_end - 1; 3131 3132 status = hmm_migrate_range(va_block, 3133 &va_block_retry, 3134 va_block_context, 3135 dest_id, 3136 addr, 3137 end, 3138 mode, 3139 out_tracker); 3140 if (status != NV_OK) 3141 break; 3142 } 3143 3144 return status; 3145 } 3146 3147 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block, 3148 uvm_va_block_context_t *va_block_context, 3149 uvm_gpu_chunk_t *gpu_chunk, 3150 uvm_va_block_region_t chunk_region) 3151 { 3152 uvm_thread_context_t *uvm_context = uvm_thread_context(); 3153 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3154 uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 3155 unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 3156 uvm_page_index_t page_index = chunk_region.first; 3157 int ret; 3158 3159 uvm_assert_mutex_locked(&va_block->lock); 3160 // TODO: Bug 3368756: add support for large GPU pages. 
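    // Until large GPU page support is added, a device private chunk covers
    // exactly one PAGE_SIZE page, which is what the assert below relies on.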
3161 UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1); 3162 3163 uvm_context->ignore_hmm_invalidate_va_block = va_block; 3164 ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region)); 3165 uvm_context->ignore_hmm_invalidate_va_block = NULL; 3166 if (ret) 3167 return errno_to_nv_status(ret); 3168 3169 return NV_OK; 3170 } 3171 3172 // Note that the caller must initialize va_block_context->hmm.src_pfns by 3173 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this. 3174 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, 3175 uvm_va_block_context_t *va_block_context, 3176 const uvm_page_mask_t *pages_to_evict, 3177 uvm_va_block_region_t region, 3178 uvm_make_resident_cause_t cause, 3179 bool *out_accessed_by_set) 3180 { 3181 NvU64 start = uvm_va_block_region_start(va_block, region); 3182 NvU64 end = uvm_va_block_region_end(va_block, region); 3183 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3184 unsigned long *dst_pfns = va_block_context->hmm.dst_pfns; 3185 uvm_hmm_migrate_event_t uvm_hmm_migrate_event = { 3186 .va_block = va_block, 3187 .va_block_retry = NULL, 3188 .va_block_context = va_block_context, 3189 .region = region, 3190 .dest_id = UVM_ID_CPU, 3191 .cause = cause, 3192 }; 3193 uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask; 3194 const uvm_va_policy_t *policy; 3195 uvm_va_policy_node_t *node; 3196 unsigned long npages; 3197 NV_STATUS status; 3198 3199 uvm_assert_mutex_locked(&va_block->lock); 3200 3201 if (out_accessed_by_set) 3202 *out_accessed_by_set = false; 3203 3204 // Note that there is no VMA available when evicting HMM pages. 3205 va_block_context->hmm.vma = NULL; 3206 3207 uvm_page_mask_copy(page_mask, pages_to_evict); 3208 3209 uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) { 3210 npages = uvm_va_block_region_num_pages(region); 3211 3212 if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0) 3213 *out_accessed_by_set = true; 3214 3215 // Pages resident on the GPU should not have a resident page in system 3216 // memory. 3217 // TODO: Bug 3660922: Need to handle read duplication at some point. 
3218 UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region)); 3219 3220 status = alloc_and_copy_to_cpu(va_block, 3221 NULL, 3222 src_pfns, 3223 dst_pfns, 3224 region, 3225 page_mask, 3226 NULL, 3227 UVM_ID_INVALID, 3228 NULL); 3229 if (status != NV_OK) 3230 goto err; 3231 3232 status = uvm_va_block_make_resident_copy(va_block, 3233 NULL, 3234 va_block_context, 3235 UVM_ID_CPU, 3236 region, 3237 page_mask, 3238 NULL, 3239 cause); 3240 if (status != NV_OK) 3241 goto err; 3242 3243 migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages); 3244 3245 uvm_hmm_migrate_event.region = region; 3246 3247 status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event); 3248 if (status != NV_OK) 3249 goto err; 3250 3251 migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages); 3252 } 3253 3254 return NV_OK; 3255 3256 err: 3257 migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages); 3258 return status; 3259 } 3260 3261 NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block, 3262 uvm_va_block_context_t *va_block_context, 3263 const uvm_page_mask_t *pages_to_evict, 3264 uvm_va_block_region_t region, 3265 bool *out_accessed_by_set) 3266 { 3267 return hmm_va_block_evict_chunks(va_block, 3268 va_block_context, 3269 pages_to_evict, 3270 region, 3271 UVM_MAKE_RESIDENT_CAUSE_EVICTION, 3272 out_accessed_by_set); 3273 } 3274 3275 NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block, 3276 uvm_gpu_t *gpu, 3277 uvm_va_block_context_t *va_block_context, 3278 const uvm_page_mask_t *pages_to_evict, 3279 uvm_va_block_region_t region) 3280 { 3281 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3282 uvm_va_block_gpu_state_t *gpu_state; 3283 uvm_page_index_t page_index; 3284 uvm_gpu_chunk_t *gpu_chunk; 3285 NV_STATUS status; 3286 3287 uvm_assert_mutex_locked(&va_block->lock); 3288 3289 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id); 3290 UVM_ASSERT(gpu_state); 3291 UVM_ASSERT(gpu_state->chunks); 3292 3293 // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU. 3294 memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns)); 3295 3296 // TODO: Bug 3368756: add support for large GPU pages. 3297 for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) { 3298 gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, 3299 gpu, 3300 uvm_va_block_cpu_page_address(va_block, page_index)); 3301 status = uvm_hmm_va_block_evict_chunk_prep(va_block, 3302 va_block_context, 3303 gpu_chunk, 3304 uvm_va_block_region_for_page(page_index)); 3305 if (status != NV_OK) 3306 return status; 3307 } 3308 3309 return hmm_va_block_evict_chunks(va_block, 3310 va_block_context, 3311 pages_to_evict, 3312 region, 3313 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE, 3314 NULL); 3315 } 3316 3317 NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn) 3318 { 3319 unsigned long src_pfn = 0; 3320 unsigned long dst_pfn = 0; 3321 struct page *dst_page; 3322 NV_STATUS status = NV_OK; 3323 int ret; 3324 3325 ret = migrate_device_range(&src_pfn, pfn, 1); 3326 if (ret) 3327 return errno_to_nv_status(ret); 3328 3329 if (src_pfn & MIGRATE_PFN_MIGRATE) { 3330 // All the code for copying a vidmem page to sysmem relies on 3331 // having a va_block. However certain combinations of mremap() 3332 // and fork() can result in device-private pages being mapped 3333 // in a child process without a va_block. 
3334         //
3335         // We don't expect the above to be a common occurrence so, for
3336         // now, we allocate a fresh zero page when evicting without a
3337         // va_block. However, this results in child processes losing
3338         // data, so make sure we warn about it. Ideally we would just
3339         // not migrate and SIGBUS the child if it tries to access the
3340         // page. However, that would prevent unloading of the driver, so
3341         // we're stuck with this until we fix the problem.
3342         // TODO: Bug 3902536: add code to migrate GPU memory without having a
3343         // va_block.
3344         WARN_ON(1);
3345         dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
3346         if (!dst_page) {
3347             status = NV_ERR_NO_MEMORY;
3348             goto out;
3349         }
3350
3351         lock_page(dst_page);
3352         dst_pfn = migrate_pfn(page_to_pfn(dst_page));
3353
3354         migrate_device_pages(&src_pfn, &dst_pfn, 1);
3355     }
3356
3357 out:
3358     migrate_device_finalize(&src_pfn, &dst_pfn, 1);
3359
3360     return status;
3361 }
3362
3363 // The routines below are all for UVM-HMM tests.
3364
3365 NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
3366                                         struct mm_struct *mm,
3367                                         NvU64 lookup_address,
3368                                         NvU64 *startp,
3369                                         NvU64 *endp,
3370                                         UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
3371 {
3372     struct vm_area_struct *vma;
3373     NvU64 start;
3374     NvU64 end;
3375
3376     if (!uvm_hmm_is_enabled(va_space) || !mm)
3377         return NV_ERR_INVALID_ADDRESS;
3378
3379     uvm_assert_mmap_lock_locked(mm);
3380     uvm_assert_rwsem_locked(&va_space->lock);
3381
3382     // The VMA might have changed while not holding mmap_lock so check it.
3383     vma = find_vma(mm, lookup_address);
3384     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3385         return NV_ERR_INVALID_ADDRESS;
3386
3387     // Since managed VA ranges don't cover more than one VMA, return only the
3388     // intersecting range of the VA block and VMA.
3389     start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
3390     end = start + UVM_VA_BLOCK_SIZE - 1;
3391     if (start < vma->vm_start)
3392         start = vma->vm_start;
3393     if (end > vma->vm_end - 1)
3394         end = vma->vm_end - 1;
3395
3396     *startp = start;
3397     *endp = end;
3398
3399     if (params) {
3400         uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
3401         params->resident_physical_size[0] = PAGE_SIZE;
3402         params->resident_on_count = 1;
3403
3404         uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
3405         params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
3406                                   UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
3407         params->page_size[0] = PAGE_SIZE;
3408         params->mapped_on_count = 1;
3409
3410         uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
3411         params->populated_on_count = 1;
3412     }
3413
3414     return NV_OK;
3415 }
3416
3417 NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
3418                                                  struct mm_struct *mm,
3419                                                  NvU64 lookup_address,
3420                                                  bool populate)
3421 {
3422     uvm_va_space_t *va_space = va_block->hmm.va_space;
3423     struct vm_area_struct *vma;
3424     struct hmm_range range;
3425     uvm_va_block_region_t region;
3426     unsigned long pfn;
3427     NvU64 end;
3428     int ret;
3429     NV_STATUS status;
3430
3431     if (!uvm_hmm_is_enabled(va_space) || !mm)
3432         return NV_ERR_INVALID_ADDRESS;
3433
3434     uvm_assert_mmap_lock_locked(mm);
3435     uvm_assert_rwsem_locked(&va_space->lock);
3436
3437     // The VMA might have changed while not holding mmap_lock so check it.
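    // After re-validating the VMA, the snapshot below follows the standard
    // hmm_range_fault() retry pattern: sample the notifier sequence with
    // mmu_interval_read_begin(), fault/snapshot the PFN, then retry if
    // mmu_interval_read_retry() reports that an invalidation raced with us
    // before the va_block lock was taken.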
3438     vma = find_vma(mm, lookup_address);
3439     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3440         return NV_ERR_INVALID_ADDRESS;
3441
3442     end = lookup_address + PAGE_SIZE;
3443     region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);
3444
3445     range.notifier = &va_block->hmm.notifier;
3446     range.start = lookup_address;
3447     range.end = end;
3448     range.hmm_pfns = &pfn;
3449     range.default_flags = 0;
3450     range.pfn_flags_mask = 0;
3451     range.dev_private_owner = &g_uvm_global;
3452
3453     if (populate) {
3454         range.default_flags = HMM_PFN_REQ_FAULT;
3455         if (vma->vm_flags & VM_WRITE)
3456             range.default_flags |= HMM_PFN_REQ_WRITE;
3457     }
3458
3459     uvm_hmm_migrate_begin_wait(va_block);
3460
3461     while (true) {
3462         range.notifier_seq = mmu_interval_read_begin(range.notifier);
3463         ret = hmm_range_fault(&range);
3464         if (ret == -EBUSY)
3465             continue;
3466         if (ret) {
3467             uvm_hmm_migrate_finish(va_block);
3468             return errno_to_nv_status(ret);
3469         }
3470
3471         uvm_mutex_lock(&va_block->lock);
3472
3473         if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
3474             break;
3475
3476         uvm_mutex_unlock(&va_block->lock);
3477     }
3478
3479     // Update the va_block CPU state based on the snapshot.
3480     // Note that we have to adjust the pfns address since it will be indexed
3481     // by region.first.
3482     status = populate_region(va_block, &pfn - region.first, region, NULL);
3483
3484     uvm_mutex_unlock(&va_block->lock);
3485     uvm_hmm_migrate_finish(va_block);
3486
3487     return status;
3488 }
3489
3490 NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
3491 {
3492     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3493
3494     atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);
3495
3496     return NV_OK;
3497 }
3498
3499 NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
3500                                 struct mm_struct *mm,
3501                                 UVM_TEST_VA_RANGE_INFO_PARAMS *params)
3502 {
3503     uvm_range_tree_node_t *tree_node;
3504     const uvm_va_policy_node_t *node;
3505     struct vm_area_struct *vma;
3506     uvm_va_block_t *va_block;
3507
3508     if (!mm || !uvm_hmm_is_enabled(va_space))
3509         return NV_ERR_INVALID_ADDRESS;
3510
3511     uvm_assert_mmap_lock_locked(mm);
3512     uvm_assert_rwsem_locked(&va_space->lock);
3513
3514     params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
3515     params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
3516     params->va_range_start = 0;
3517     params->va_range_end = ULONG_MAX;
3518     params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
3519     memset(&params->preferred_location, 0, sizeof(params->preferred_location));
3520     params->accessed_by_count = 0;
3521     params->managed.vma_start = 0;
3522     params->managed.vma_end = 0;
3523     params->managed.is_zombie = NV_FALSE;
3524     params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
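    // The defaults above describe a range spanning the whole address space
    // with unset policies; they are narrowed below first to the VMA, then to
    // the va_block, and finally to the policy node (or policy hole)
    // containing params->lookup_address.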
3525
3526     vma = find_vma(mm, params->lookup_address);
3527     if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
3528         return NV_ERR_INVALID_ADDRESS;
3529
3530     params->va_range_start = vma->vm_start;
3531     params->va_range_end = vma->vm_end - 1;
3532     params->managed.vma_start = vma->vm_start;
3533     params->managed.vma_end = vma->vm_end - 1;
3534
3535     uvm_mutex_lock(&va_space->hmm.blocks_lock);
3536     tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
3537     if (!tree_node) {
3538         UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
3539                                                &params->va_range_start, &params->va_range_end) == NV_OK);
3540         uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3541         return NV_OK;
3542     }
3543
3544     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3545     va_block = hmm_va_block_from_node(tree_node);
3546     uvm_mutex_lock(&va_block->lock);
3547
3548     params->va_range_start = va_block->start;
3549     params->va_range_end = va_block->end;
3550
3551     node = uvm_va_policy_node_find(va_block, params->lookup_address);
3552     if (node) {
3553         uvm_processor_id_t processor_id;
3554
3555         if (params->va_range_start < node->node.start)
3556             params->va_range_start = node->node.start;
3557         if (params->va_range_end > node->node.end)
3558             params->va_range_end = node->node.end;
3559
3560         params->read_duplication = node->policy.read_duplication;
3561
3562         if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
3563             uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
3564
3565         for_each_id_in_mask(processor_id, &node->policy.accessed_by)
3566             uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
3567     }
3568     else {
3569         uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
3570                                     &params->va_range_start, &params->va_range_end);
3571     }
3572
3573     uvm_mutex_unlock(&va_block->lock);
3574
3575     return NV_OK;
3576 }
3577
3578 // TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
3579 // for VMAs other than anonymous private memory.
3580 bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
3581                              uvm_va_block_context_t *va_block_context)
3582 {
3583     struct vm_area_struct *vma = va_block_context->hmm.vma;
3584
3585     uvm_assert_mutex_locked(&va_block->lock);
3586
3587     if (!uvm_va_block_is_hmm(va_block))
3588         return false;
3589
3590     UVM_ASSERT(vma);
3591     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
3592     uvm_assert_mmap_lock_locked(va_block_context->mm);
3593
3594     // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
3595     if (va_block_context->hmm.swap_cached)
3596         return true;
3597
3598     // migrate_vma_setup() can't migrate VM_SPECIAL so we have to force GPU
3599     // remote mapping.
3600     // TODO: Bug 3660968: add support for file-backed migrations.
3601     // TODO: Bug 3368756: add support for transparent huge page migrations.
3602     return !vma_is_anonymous(vma) ||
3603            (vma->vm_flags & VM_SPECIAL) ||
3604            vma_is_dax(vma) ||
3605            is_vm_hugetlb_page(vma);
3606 }
3607
3608 #endif // UVM_IS_CONFIG_HMM()
3609
3610