/*******************************************************************************
    Copyright (c) 2016-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_hmm.h"

// Support for HMM ( https://docs.kernel.org/mm/hmm.html ):

#ifdef NVCPU_X86_64
static bool uvm_disable_hmm = false;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: false (HMM is enabled if possible). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
#else
// So far, we've only tested HMM on x86_64, so disable it by default everywhere
// else.
static bool uvm_disable_hmm = true;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: true (HMM is not enabled on this CPU architecture). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
#endif

module_param(uvm_disable_hmm, bool, 0444);

#if UVM_IS_CONFIG_HMM()

#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/migrate.h>
#include <linux/userfaultfd_k.h>
#include <linux/memremap.h>
#include <linux/wait.h>

#include "uvm_common.h"
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_range_tree.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_lock.h"
#include "uvm_api.h"
#include "uvm_va_policy.h"
#include "uvm_tools.h"

static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
                               uvm_page_index_t page_index,
                               struct page *page);

typedef struct
{
    uvm_processor_id_t processor_id;
    uvm_processor_id_t new_residency;
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_service_block_context_t *service_context;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_gpu_fault_event_t;

typedef struct
{
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_va_block_context_t *va_block_context;
    uvm_va_block_region_t region;
    uvm_processor_id_t dest_id;
    uvm_make_resident_cause_t cause;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_migrate_event_t;

typedef struct
{
    uvm_processor_id_t processor_id;
    uvm_va_block_t *va_block;
    uvm_va_block_retry_t *va_block_retry;
    uvm_service_block_context_t *service_context;
    uvm_page_mask_t page_mask;
    uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_devmem_fault_context_t;

bool uvm_hmm_is_enabled_system_wide(void)
{
    return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
}

bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
{
    return uvm_hmm_is_enabled_system_wide() &&
           uvm_va_space_mm_enabled(va_space) &&
           !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM);
}

static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
{
    if (!node)
        return NULL;
    return container_of(node, uvm_va_block_t, hmm.node);
}

NV_STATUS uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
{
    uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
    struct mm_struct *mm = va_space->va_space_mm.mm;
    int ret;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_OK;

    uvm_assert_mmap_lock_locked_write(mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_range_tree_init(&hmm_va_space->blocks);
    uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF);

    // Initialize MMU interval notifiers for this process.
    // This allows mmu_interval_notifier_insert() to be called without holding
    // the mmap_lock for write.
    // Note: there is no __mmu_notifier_unregister(), this call just allocates
    // memory which is attached to the mm_struct and freed when the mm_struct
    // is freed.
    ret = __mmu_notifier_register(NULL, mm);
    if (ret)
        return errno_to_nv_status(ret);

    return NV_OK;
}

void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
{
    uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;

    if (!uvm_hmm_is_enabled(va_space))
        return;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    // The blocks_lock is not needed when the va_space lock is held for write.
    uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
        va_block = hmm_va_block_from_node(node);
        uvm_range_tree_remove(&hmm_va_space->blocks, node);
        mmu_interval_notifier_remove(&va_block->hmm.notifier);
        uvm_va_block_kill(va_block);
    }
}

static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
                                        uvm_gpu_t *gpu,
                                        struct mm_struct *mm)
{
    uvm_va_policy_node_t *node;

    uvm_mutex_lock(&va_block->lock);

    // Reset preferred location and accessed-by of policy nodes if needed.
    uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
        if (uvm_id_equal(node->policy.preferred_location, gpu->id))
            node->policy.preferred_location = UVM_ID_INVALID;

        uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
    }

    // Migrate and free any remaining resident allocations on this GPU.
    uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);

    uvm_mutex_unlock(&va_block->lock);
}

void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
{
    uvm_range_tree_node_t *node;
    uvm_va_block_t *va_block;

    if (!uvm_hmm_is_enabled(va_space))
        return;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
        va_block = hmm_va_block_from_node(node);

        hmm_va_block_unregister_gpu(va_block, gpu, mm);
    }
}

static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
                                             uvm_gpu_va_space_t *gpu_va_space,
                                             uvm_va_block_context_t *va_block_context)
{
    uvm_mutex_lock(&va_block->lock);

    uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);

    uvm_mutex_unlock(&va_block->lock);

    // TODO: Bug 3660922: Need to handle read duplication at some point.
    // See range_remove_gpu_va_space_managed().
}

void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
                                 uvm_gpu_va_space_t *gpu_va_space,
                                 struct mm_struct *mm)
{
    uvm_va_block_context_t *va_block_context;
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;

    if (!uvm_hmm_is_enabled(va_space))
        return;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    va_block_context = uvm_va_space_block_context(va_space, mm);

    uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
        va_block = hmm_va_block_from_node(node);

        hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
    }
}

static bool hmm_invalidate(uvm_va_block_t *va_block,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq)
{
    uvm_thread_context_t *uvm_context = uvm_thread_context();
    struct mmu_interval_notifier *mni = &va_block->hmm.notifier;
    struct mm_struct *mm = mni->mm;
    uvm_va_block_context_t *va_block_context;
    uvm_va_block_region_t region;
    NvU64 start, end;
    uvm_processor_id_t id;
    NV_STATUS status = NV_OK;

    // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release()
    // doesn't remove the interval notifiers from the mm_struct so there will
    // be a full range MMU_NOTIFY_UNMAP event after the release from
    // unmap_vmas() during exit_mmap().
    if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE)
        return true;

    // Blockable is only set false by
    // mmu_notifier_invalidate_range_start_nonblock() which is only called in
    // __oom_reap_task_mm().
    if (!mmu_notifier_range_blockable(range))
        return false;

    // We only ignore invalidations in this context whilst holding the
    // va_block lock. This prevents deadlock when try_to_migrate()
    // calls the notifier, but holding the lock prevents other threads
    // invalidating PTEs so we can safely assume the results of
    // migrate_vma_setup() are correct.
    if (uvm_context->ignore_hmm_invalidate_va_block == va_block ||
        ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) &&
         range->owner == &g_uvm_global))
        return true;

    va_block_context = uvm_va_block_context_alloc(mm);
    if (!va_block_context)
        return true;

    uvm_mutex_lock(&va_block->lock);

    // mmu_interval_notifier_remove() is always called before marking a
    // va_block as dead so this va_block has to be alive.
    UVM_ASSERT(!uvm_va_block_is_dead(va_block));

    // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
    // Also note that hmm_invalidate() can be called when a new va_block is not
    // yet inserted into the va_space->hmm.blocks table while the original
    // va_block is being split. The original va_block may have its end address
    // updated before the mmu interval notifier is updated so this invalidate
    // may be for a range past the va_block end address.
    start = range->start;
    end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
    if (start < va_block->start)
        start = va_block->start;
    if (end > va_block->end)
        end = va_block->end;
    if (start > end)
        goto unlock;

    // These will be equal if no other thread causes an invalidation
    // whilst the va_block lock was dropped.
    uvm_context->hmm_invalidate_seqnum++;
    va_block->hmm.changed++;

    mmu_interval_set_seq(mni, cur_seq);

    region = uvm_va_block_region_from_start_end(va_block, start, end);

    va_block_context->hmm.vma = NULL;
    va_block_context->policy = NULL;

    // We only need to unmap GPUs since Linux handles the CPUs.
    for_each_gpu_id_in_mask(id, &va_block->mapped) {
        status = uvm_va_block_unmap(va_block,
                                    va_block_context,
                                    id,
                                    region,
                                    uvm_va_block_map_mask_get(va_block, id),
                                    &va_block->tracker);
        // Note that the va_block lock can be dropped, relocked, and
        // NV_ERR_MORE_PROCESSING_REQUIRED returned.
        if (status != NV_OK)
            break;
    }

    if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR)
        uvm_va_block_munmap_region(va_block, region);

    if (status == NV_OK)
        status = uvm_tracker_wait(&va_block->tracker);

    // Remove stale HMM struct page pointers to system memory.
    uvm_va_block_remove_cpu_chunks(va_block, region);

unlock:
    uvm_mutex_unlock(&va_block->lock);

    uvm_va_block_context_free(va_block_context);

    UVM_ASSERT(status == NV_OK);
    return true;
}

static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni,
                                     const struct mmu_notifier_range *range,
                                     unsigned long cur_seq)
{
    uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier);

    UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq));
}

static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops =
{
    .invalidate = uvm_hmm_invalidate_entry,
};

NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space,
                                NvU64 addr,
                                uvm_va_block_t **va_block_ptr)
{
    uvm_range_tree_node_t *node;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_rwsem_locked(&va_space->lock);

    uvm_mutex_lock(&va_space->hmm.blocks_lock);
    node = uvm_range_tree_find(&va_space->hmm.blocks, addr);
    uvm_mutex_unlock(&va_space->hmm.blocks_lock);

    if (!node)
        return NV_ERR_OBJECT_NOT_FOUND;

    *va_block_ptr = hmm_va_block_from_node(node);

    return NV_OK;
}

static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block)
{
    uvm_thread_context_t *uvm_context = uvm_thread_context();
    int ret;

    // It's only safe to ignore invalidations whilst doing a migration
    // and holding the va_block lock.
    uvm_assert_mutex_locked(&va_block->lock);
    uvm_context->ignore_hmm_invalidate_va_block = va_block;
    ret = migrate_vma_setup(args);

    // We shouldn't be generating any more invalidations now.
    uvm_context->ignore_hmm_invalidate_va_block = NULL;
    return ret;
}

static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma,
                                 unsigned long addr,
                                 bool allow_unreadable_vma)
{
    // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support
    // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ
    // but we allow those VMAs to have policy set on them.
    // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled
    // by uvm_hmm_must_use_sysmem() forcing residency to the CPU.
    return vma &&
           addr >= vma->vm_start &&
           !userfaultfd_armed(vma) &&
           !(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
           !uvm_vma_is_managed(vma) &&
           (allow_unreadable_vma || (vma->vm_flags & VM_READ));
}

static void hmm_va_block_init(uvm_va_block_t *va_block,
                              uvm_va_space_t *va_space,
                              NvU64 start,
                              NvU64 end)
{
    va_block->hmm.va_space = va_space;
    va_block->hmm.node.start = start;
    va_block->hmm.node.end = end;
    uvm_range_tree_init(&va_block->hmm.va_policy_tree);
    uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
}

static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
                                          NvU64 addr,
                                          bool allow_unreadable_vma,
                                          uvm_va_block_context_t *va_block_context,
                                          uvm_va_block_t **va_block_ptr)
{
    struct mm_struct *mm = va_space->va_space_mm.mm;
    struct vm_area_struct *vma;
    uvm_va_block_t *va_block;
    NvU64 start, end;
    NV_STATUS status;
    int ret;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    UVM_ASSERT(mm);
    UVM_ASSERT(!va_block_context || va_block_context->mm == mm);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);
    UVM_ASSERT(PAGE_ALIGNED(addr));

    // Note that we have to allow PROT_NONE VMAs so that policies can be set.
    vma = find_vma(mm, addr);
    if (!uvm_hmm_vma_is_valid(vma, addr, allow_unreadable_vma))
        return NV_ERR_INVALID_ADDRESS;

    // Since we only hold the va_space read lock, there can be multiple
    // parallel va_block insertions.
    uvm_mutex_lock(&va_space->hmm.blocks_lock);

    va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr));
    if (va_block)
        goto done;

    // The va_block is always created to cover the whole aligned
    // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or
    // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it
    // doesn't overlap.
    start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
    end = start + UVM_VA_BLOCK_SIZE - 1;

    // Search for existing UVM va_ranges in the start/end interval and create
    // a maximum interval that doesn't overlap any existing UVM va_ranges.
    // We know that 'addr' is not within a va_range or
    // hmm_va_block_find_create() wouldn't be called.
    status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
    UVM_ASSERT(status == NV_OK);

    // Search for existing HMM va_blocks in the start/end interval and create
    // a maximum interval that doesn't overlap any existing HMM va_blocks.
    status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
    UVM_ASSERT(status == NV_OK);

    // Create a HMM va_block with a NULL va_range pointer.
    status = uvm_va_block_create(NULL, start, end, &va_block);
    if (status != NV_OK)
        goto err_unlock;

    hmm_va_block_init(va_block, va_space, start, end);

    ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
                                       mm,
                                       start,
                                       end - start + 1,
                                       &uvm_hmm_notifier_ops);
    if (ret) {
        status = errno_to_nv_status(ret);
        goto err_release;
    }

    status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
    UVM_ASSERT(status == NV_OK);

done:
    uvm_mutex_unlock(&va_space->hmm.blocks_lock);
    if (va_block_context)
        va_block_context->hmm.vma = vma;
    *va_block_ptr = va_block;
    return NV_OK;

err_release:
    uvm_va_block_release(va_block);

err_unlock:
    uvm_mutex_unlock(&va_space->hmm.blocks_lock);
    return status;
}

NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
                                       NvU64 addr,
                                       uvm_va_block_context_t *va_block_context,
                                       uvm_va_block_t **va_block_ptr)
{
    return hmm_va_block_find_create(va_space, addr, false, va_block_context, va_block_ptr);
}

NV_STATUS uvm_hmm_find_vma(uvm_va_block_context_t *va_block_context, NvU64 addr)
{
    struct mm_struct *mm = va_block_context->mm;
    struct vm_area_struct *vma;

    if (!mm)
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);

    vma = find_vma(mm, addr);
    if (!uvm_hmm_vma_is_valid(vma, addr, false))
        return NV_ERR_INVALID_ADDRESS;

    va_block_context->hmm.vma = vma;

    return NV_OK;
}

bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        uvm_va_block_region_t region)
{
    uvm_assert_mutex_locked(&va_block->lock);

    if (uvm_va_block_is_hmm(va_block)) {
        struct vm_area_struct *vma = va_block_context->hmm.vma;

        UVM_ASSERT(vma);
        UVM_ASSERT(va_block_context->mm == vma->vm_mm);
        uvm_assert_mmap_lock_locked(va_block_context->mm);
        UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
        UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
    }

    return true;
}

void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
{
    // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
    service_context->block_context.hmm.swap_cached = false;
}

NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
{
    if (uvm_mutex_trylock(&va_block->hmm.migrate_lock))
        return NV_OK;

    return NV_ERR_BUSY_RETRY;
}

void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
{
    uvm_mutex_lock(&va_block->hmm.migrate_lock);
}

void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
{
    uvm_mutex_unlock(&va_block->hmm.migrate_lock);
}

// Migrate the given range [start end] within a va_block to dest_id.
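// Note: callers hold the mmap_lock and the va_space lock (see the policy
// comment in the loop below); this function additionally takes the va_block
// hmm.migrate_lock and the va_block lock around the migration itself.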
static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
                                   uvm_va_block_retry_t *va_block_retry,
                                   uvm_va_block_context_t *va_block_context,
                                   uvm_processor_id_t dest_id,
                                   NvU64 start,
                                   NvU64 end,
                                   uvm_migrate_mode_t mode,
                                   uvm_tracker_t *out_tracker)
{
    uvm_va_block_region_t region;
    uvm_va_policy_node_t *node;
    const uvm_va_policy_t *policy;
    NV_STATUS status = NV_OK;

    uvm_hmm_migrate_begin_wait(va_block);
    uvm_mutex_lock(&va_block->lock);

    uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
        va_block_context->policy = policy;

        // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
        // va_block lock, the policy remains valid because we hold the mmap
        // lock so munmap can't remove the policy, and the va_space lock so the
        // policy APIs can't change the policy.
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
                                           va_block_retry,
                                           uvm_va_block_migrate_locked(va_block,
                                                                       va_block_retry,
                                                                       va_block_context,
                                                                       region,
                                                                       dest_id,
                                                                       mode,
                                                                       out_tracker));
        if (status != NV_OK)
            break;
    }

    uvm_mutex_unlock(&va_block->lock);
    uvm_hmm_migrate_finish(va_block);

    return status;
}

void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
{
    // We can't use uvm_va_space_mm_retain(), because the va_space_mm
    // should already be dead by now.
    struct mm_struct *mm = va_space->va_space_mm.mm;
    uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;
    uvm_va_block_context_t *block_context;

    uvm_down_read_mmap_lock(mm);
    uvm_va_space_down_write(va_space);

    uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
        uvm_va_block_region_t region;
        struct vm_area_struct *vma;

        va_block = hmm_va_block_from_node(node);
        block_context = uvm_va_space_block_context(va_space, mm);
        uvm_hmm_migrate_begin_wait(va_block);
        uvm_mutex_lock(&va_block->lock);
        for_each_va_block_vma_region(va_block, mm, vma, &region) {
            if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
                continue;

            block_context->hmm.vma = vma;
            block_context->policy = &uvm_va_policy_default;
            uvm_hmm_va_block_migrate_locked(va_block,
                                            NULL,
                                            block_context,
                                            UVM_ID_CPU,
                                            region,
                                            UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
        }
        uvm_mutex_unlock(&va_block->lock);
        uvm_hmm_migrate_finish(va_block);
    }

    uvm_va_space_up_write(va_space);
    uvm_up_read_mmap_lock(mm);
}

NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
    uvm_va_block_test_t *block_test;
    uvm_va_block_t *va_block;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
    if (status != NV_OK)
        return status;

    block_test = uvm_va_block_get_test(va_block);
    if (block_test)
        block_test->inject_split_error = true;

    return NV_OK;
}

typedef struct {
    struct mmu_interval_notifier notifier;
    uvm_va_block_t *existing_block;
} hmm_split_invalidate_data_t;

static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
                                 const struct mmu_notifier_range *range,
                                 unsigned long cur_seq)
{
    hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);

    uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
    hmm_invalidate(split_data->existing_block, range, cur_seq);

    return true;
}

static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni,
                                       const struct mmu_notifier_range *range,
                                       unsigned long cur_seq)
{
    UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq));
}

static const struct mmu_interval_notifier_ops hmm_notifier_split_ops =
{
    .invalidate = hmm_split_invalidate_entry,
};

// Splits existing va_block into two pieces, with new_va_block always after
// va_block. va_block is updated to have new_end. new_end+1 must be page-
// aligned.
//
// Before: [----------- existing ------------]
// After:  [---- existing ----][---- new ----]
//                            ^new_end
//
// On error, va_block is still accessible and is left in its original
// functional state.
static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
                                 NvU64 new_end,
                                 uvm_va_block_t **new_block_ptr)
{
    uvm_va_space_t *va_space = va_block->hmm.va_space;
    struct mm_struct *mm = va_space->va_space_mm.mm;
    hmm_split_invalidate_data_t split_data;
    NvU64 delay_us;
    uvm_va_block_t *new_va_block;
    NV_STATUS status;
    int ret;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    UVM_ASSERT(new_end > va_block->start);
    UVM_ASSERT(new_end < va_block->end);
    UVM_ASSERT(PAGE_ALIGNED(new_end + 1));

    status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block);
    if (status != NV_OK)
        return status;

    // Initialize the newly created HMM va_block.
    hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end);

    ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
                                       mm,
                                       new_va_block->start,
                                       uvm_va_block_size(new_va_block),
                                       &uvm_hmm_notifier_ops);

    // Since __mmu_notifier_register() was called when the va_space was
    // initially created, we know that mm->notifier_subscriptions is valid
    // and mmu_interval_notifier_insert() can't return ENOMEM.
    // The only error return is for start + length overflowing but we already
    // registered the same address range before so there should be no error.
    UVM_ASSERT(!ret);

    uvm_mutex_lock(&va_block->lock);

    status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
    if (status != NV_OK)
        goto err;

    uvm_mutex_unlock(&va_block->lock);

    // The MMU interval notifier has to be removed in order to resize it.
    // That means there would be a window of time when invalidation callbacks
    // could be missed. To handle this case, we register a temporary notifier
    // to cover the address range while resizing the old notifier (it is
    // OK to have multiple notifiers for the same range, we may simply try to
    // invalidate twice).
    split_data.existing_block = va_block;
    ret = mmu_interval_notifier_insert(&split_data.notifier,
                                       mm,
                                       va_block->start,
                                       new_end - va_block->start + 1,
                                       &hmm_notifier_split_ops);
    UVM_ASSERT(!ret);

    // Delay to allow hmm_sanity test to trigger an mmu_notifier during the
    // critical window where the split invalidate callback is active.
    delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
    if (delay_us)
        udelay(delay_us);

    mmu_interval_notifier_remove(&va_block->hmm.notifier);

    // Enable notifications on the old block with the smaller size.
    ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
                                       mm,
                                       va_block->start,
                                       uvm_va_block_size(va_block),
                                       &uvm_hmm_notifier_ops);
    UVM_ASSERT(!ret);

    mmu_interval_notifier_remove(&split_data.notifier);

    if (new_block_ptr)
        *new_block_ptr = new_va_block;

    return status;

err:
    uvm_mutex_unlock(&va_block->lock);
    mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
    uvm_va_block_release(new_va_block);
    return status;
}

// Check to see if the HMM va_block would overlap the range start/end and
// split it so it can be removed. That breaks down to the following cases:
// start/end could cover all of the HMM va_block ->
//     remove the va_block
// start/end could cover the left part of the HMM va_block ->
//     remove the left part
// start/end could cover the right part of the HMM va_block ->
//     remove the right part
// or start/end could "punch a hole" in the middle and leave the ends intact.
// In each case, only one HMM va_block is removed so return it in out_va_block.
static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
                                       NvU64 start,
                                       NvU64 end,
                                       uvm_va_block_t **out_va_block)
{
    uvm_va_block_context_t *va_block_context;
    uvm_va_space_t *va_space;
    struct mm_struct *mm;
    struct vm_area_struct *vma;
    uvm_va_block_region_t region;
    NvU64 addr, from, to;
    uvm_va_block_t *new;
    NV_STATUS status;

    if (va_block->start < start) {
        status = hmm_split_block(va_block, start - 1, &new);
        if (status != NV_OK)
            return status;

        // Keep the left part, the right part will be deleted.
        va_block = new;
    }

    if (va_block->end > end) {
        status = hmm_split_block(va_block, end, NULL);
        if (status != NV_OK)
            return status;

        // Keep the right part, the left part will be deleted.
    }

    *out_va_block = va_block;

    // Migrate any GPU data to sysmem before destroying the HMM va_block.
    // We do this because the new va_range might be for a UVM external
    // allocation which could be converting an address range that was first
    // operated on by UVM-HMM and the external allocation should see that data.
    va_space = va_block->hmm.va_space;
    mm = va_space->va_space_mm.mm;
    va_block_context = uvm_va_space_block_context(va_space, mm);

    for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
        vma = find_vma_intersection(mm, addr, va_block->end);
        if (!vma)
            break;

        from = max(addr, (NvU64)vma->vm_start);
        to = min(va_block->end, (NvU64)vma->vm_end - 1);
        region = uvm_va_block_region_from_start_end(va_block, from, to);

        if (!uvm_hmm_vma_is_valid(vma, from, false))
            continue;

        va_block_context->hmm.vma = vma;

        status = hmm_migrate_range(va_block,
                                   NULL,
                                   va_block_context,
                                   UVM_ID_CPU,
                                   from,
                                   to,
                                   UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
                                   NULL);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

// Normally, the HMM va_block is destroyed when the va_space is destroyed
// (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
// a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
// page tables. However, it doesn't destroy the va_block because that would
// require calling mmu_interval_notifier_remove() which can't be called from
// the invalidate callback due to Linux locking constraints. If a process
// calls mmap()/munmap() for SAM and then creates a managed allocation,
// the same VMA range can be picked and there would be a UVM/HMM va_block
// conflict. Creating a managed allocation, external allocation, or other
// va_range types, calls this function to remove stale HMM va_blocks or split
// the HMM va_block so there is no overlap.
NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
                                   struct mm_struct *mm,
                                   NvU64 start,
                                   NvU64 end)
{
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_OK;

    if (mm)
        uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    // Process each HMM va_block that overlaps the interval [start, end].
    // Note that end is inclusive.
    // The blocks_lock is not needed when the va_space lock is held for write.
    uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
        va_block = hmm_va_block_from_node(node);

        if (mm) {
            status = split_block_if_needed(va_block, start, end, &va_block);
            if (status != NV_OK)
                return status;
        }

        // Note that this waits for any invalidation callbacks to complete
        // so uvm_hmm_invalidate() won't see a block disappear.
        // The va_space write lock should prevent uvm_hmm_va_block_find_create()
        // from adding it back.
        mmu_interval_notifier_remove(&va_block->hmm.notifier);
        uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
        uvm_va_block_kill(va_block);
    }

    UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));

    return NV_OK;
}

void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
{
    uvm_va_space_t *va_space = existing_va_block->hmm.va_space;

    UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
                         &existing_va_block->hmm.node,
                         &new_block->hmm.node);
}

NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
                                  NvU64 addr,
                                  uvm_va_policy_is_split_needed_t split_needed_cb,
                                  void *data)
{
    uvm_va_block_t *va_block;
    uvm_va_policy_node_t *node;
    NV_STATUS status;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    // If there is no HMM va_block or the va_block doesn't span the policy
    // addr, there is no need to split.
    status = uvm_hmm_va_block_find(va_space, addr, &va_block);
    if (status != NV_OK || va_block->start == addr)
        return NV_OK;

    uvm_mutex_lock(&va_block->lock);

    node = uvm_va_policy_node_find(va_block, addr);
    if (!node)
        goto done;

    // If the policy range doesn't span addr, we're done.
    if (addr == node->node.start)
        goto done;

    if (split_needed_cb(&node->policy, data))
        status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL);

done:
    uvm_mutex_unlock(&va_block->lock);
    return status;
}

static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block,
                                                   uvm_va_block_context_t *va_block_context,
                                                   uvm_processor_id_t preferred_location,
                                                   NvU64 addr,
                                                   NvU64 end,
                                                   uvm_tracker_t *out_tracker)
{
    uvm_processor_mask_t set_accessed_by_processors;
    const uvm_va_policy_t *old_policy;
    uvm_va_policy_node_t *node;
    uvm_va_block_region_t region;
    uvm_processor_id_t id;
    NV_STATUS status, tracker_status;

    // Note that we can't just call uvm_va_policy_set_range() for the whole
    // range [addr end] because we need to examine the old value of
    // policy->preferred_location before setting it. Thus we iterate over
    // the existing policy nodes.
    uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) {
        if (uvm_id_equal(old_policy->preferred_location, preferred_location))
            continue;

        // If the old preferred location is a valid processor ID, remote
        // mappings should be established to the new preferred location if
        // accessed-by is set.
        uvm_processor_mask_zero(&set_accessed_by_processors);

        if (UVM_ID_IS_VALID(old_policy->preferred_location) &&
            uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location))
            uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location);

        va_block_context->policy = uvm_va_policy_set_preferred_location(va_block,
                                                                        region,
                                                                        preferred_location,
                                                                        old_policy);
        if (!va_block_context->policy)
            return NV_ERR_NO_MEMORY;

        // Establish new remote mappings if the old preferred location had
        // accessed-by set.
        for_each_id_in_mask(id, &set_accessed_by_processors) {
            status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker);
            if (status != NV_OK)
                return status;
        }

        // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock
        // the va_block lock, the policy remains valid because we hold the mmap
        // lock so munmap can't remove the policy, and the va_space lock so the
        // policy APIs can't change the policy.
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
                                           NULL,
                                           uvm_va_block_set_preferred_location_locked(va_block,
                                                                                      va_block_context,
                                                                                      region));

        tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
        if (status == NV_OK)
            status = tracker_status;

        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
                                         uvm_processor_id_t preferred_location,
                                         NvU64 base,
                                         NvU64 last_address,
                                         uvm_tracker_t *out_tracker)
{
    uvm_va_block_context_t *va_block_context;
    uvm_va_block_t *va_block;
    NvU64 addr;
    NV_STATUS status = NV_OK;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);
    UVM_ASSERT(PAGE_ALIGNED(base));
    UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
    UVM_ASSERT(base < last_address);

    // Update HMM preferred location policy.
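    // Walk the range one HMM va_block at a time, creating va_blocks as needed,
    // and apply the policy change while holding each va_block's lock.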

    va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);

    for (addr = base; addr < last_address; addr = va_block->end + 1) {
        NvU64 end;

        status = hmm_va_block_find_create(va_space, addr, true, va_block_context, &va_block);
        if (status != NV_OK)
            break;

        end = min(last_address, va_block->end);

        uvm_mutex_lock(&va_block->lock);

        status = hmm_set_preferred_location_locked(va_block,
                                                   va_block_context,
                                                   preferred_location,
                                                   addr,
                                                   end,
                                                   out_tracker);

        uvm_mutex_unlock(&va_block->lock);

        if (status != NV_OK)
            break;
    }

    return status;
}

static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block,
                                                      uvm_va_block_context_t *va_block_context,
                                                      uvm_processor_id_t processor_id,
                                                      NvU64 start,
                                                      NvU64 end,
                                                      uvm_tracker_t *out_tracker)
{
    uvm_va_space_t *va_space = va_block->hmm.va_space;
    uvm_va_policy_node_t *node;
    uvm_va_block_region_t region;
    NV_STATUS status = NV_OK;

    uvm_for_each_va_policy_node_in(node, va_block, start, end) {
        // Read duplication takes precedence over SetAccessedBy.
        // Do not add mappings if read duplication is enabled.
        if (uvm_va_policy_is_read_duplicate(&node->policy, va_space))
            continue;

        va_block_context->policy = &node->policy;
        region = uvm_va_block_region_from_start_end(va_block,
                                                    max(start, node->node.start),
                                                    min(end, node->node.end));

        status = uvm_va_block_set_accessed_by_locked(va_block,
                                                     va_block_context,
                                                     processor_id,
                                                     region,
                                                     out_tracker);
        if (status != NV_OK)
            break;
    }

    return status;
}

NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space,
                                  uvm_processor_id_t processor_id,
                                  bool set_bit,
                                  NvU64 base,
                                  NvU64 last_address,
                                  uvm_tracker_t *out_tracker)
{
    uvm_va_block_context_t *va_block_context;
    uvm_va_block_t *va_block;
    NvU64 addr;
    NV_STATUS status = NV_OK;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);
    UVM_ASSERT(PAGE_ALIGNED(base));
    UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
    UVM_ASSERT(base < last_address);

    // Update HMM accessed by policy.

    va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);

    for (addr = base; addr < last_address; addr = va_block->end + 1) {
        NvU64 end;

        status = hmm_va_block_find_create(va_space, addr, true, va_block_context, &va_block);
        if (status != NV_OK)
            break;

        end = min(last_address, va_block->end);

        uvm_mutex_lock(&va_block->lock);

        status = uvm_va_policy_set_range(va_block,
                                         addr,
                                         end,
                                         UVM_VA_POLICY_ACCESSED_BY,
                                         !set_bit,
                                         processor_id,
                                         UVM_READ_DUPLICATION_MAX);

        if (status == NV_OK && set_bit) {
            status = hmm_set_accessed_by_start_end_locked(va_block,
                                                          va_block_context,
                                                          processor_id,
                                                          addr,
                                                          end,
                                                          out_tracker);
        }

        uvm_mutex_unlock(&va_block->lock);

        if (status != NV_OK)
            break;
    }

    return status;
}

void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
                                         uvm_va_block_t *va_block,
                                         uvm_va_block_context_t *block_context)
{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    uvm_va_policy_node_t *node;
    uvm_va_block_region_t region;
    uvm_processor_mask_t map_processors;
    uvm_processor_id_t id;
    NV_STATUS tracker_status;
    NV_STATUS status = NV_OK;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    uvm_mutex_lock(&va_block->lock);

    uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
        block_context->policy = &node->policy;

        for_each_id_in_mask(id, &node->policy.accessed_by) {
            status = hmm_set_accessed_by_start_end_locked(va_block,
                                                          block_context,
                                                          id,
                                                          node->node.start,
                                                          node->node.end,
                                                          &local_tracker);
            if (status != NV_OK)
                break;

            if (!uvm_va_space_map_remote_on_eviction(va_space))
                continue;

            // Exclude the processors that have been already mapped due to
            // AccessedBy.
            uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);

            for_each_gpu_id_in_mask(id, &map_processors) {
                uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
                uvm_va_block_gpu_state_t *gpu_state;

                if (!gpu->parent->access_counters_supported)
                    continue;

                gpu_state = uvm_va_block_gpu_state_get(va_block, id);
                UVM_ASSERT(gpu_state);

                // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
                // remote mappings to read-duplicated pages. Add support for it
                // or create a new function.
                status = uvm_va_block_add_mappings(va_block,
                                                   block_context,
                                                   id,
                                                   region,
                                                   &gpu_state->evicted,
                                                   UvmEventMapRemoteCauseEviction);
                tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker);
                status = (status == NV_OK) ? tracker_status : status;
                if (status != NV_OK) {
                    UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
                    break;
                }
            }
        }
    }

    uvm_mutex_unlock(&va_block->lock);

    tracker_status = uvm_tracker_wait_deinit(&local_tracker);
    status = (status == NV_OK) ? tracker_status : status;
    if (status != NV_OK) {
        UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n",
                      va_block->start,
                      va_block->end,
                      nvstatusToString(status));
    }
}

void uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
                             uvm_va_block_context_t *va_block_context,
                             unsigned long addr,
                             NvU64 *endp)
{
    struct vm_area_struct *vma = va_block_context->hmm.vma;
    const uvm_va_policy_node_t *node;
    NvU64 end = va_block->end;

    uvm_assert_mmap_lock_locked(vma->vm_mm);
    uvm_assert_mutex_locked(&va_block->lock);

    if (end > vma->vm_end - 1)
        end = vma->vm_end - 1;

    node = uvm_va_policy_node_find(va_block, addr);
    if (node) {
        va_block_context->policy = &node->policy;
        if (end > node->node.end)
            end = node->node.end;
    }
    else {
        va_block_context->policy = &uvm_va_policy_default;
    }

    *endp = end;
}

NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
                                            uvm_va_block_context_t *va_block_context,
                                            uvm_page_index_t page_index,
                                            uvm_page_index_t *outerp)
{
    struct vm_area_struct *vma;
    unsigned long addr;
    NvU64 end;
    uvm_page_index_t outer;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    uvm_assert_mmap_lock_locked(va_block_context->mm);
    uvm_assert_mutex_locked(&va_block->lock);

    addr = uvm_va_block_cpu_page_address(va_block, page_index);

    vma = vma_lookup(va_block_context->mm, addr);
    if (!vma || !(vma->vm_flags & VM_READ))
        return NV_ERR_INVALID_ADDRESS;

    va_block_context->hmm.vma = vma;

    uvm_hmm_find_policy_end(va_block, va_block_context, addr, &end);

    outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
    if (*outerp > outer)
        *outerp = outer;

    return NV_OK;
}

static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
                                            uvm_va_block_context_t *block_context)
{
    const uvm_va_policy_t *policy;
    uvm_va_policy_node_t *node;
    uvm_va_block_region_t region;
    NV_STATUS status = NV_OK;

    uvm_mutex_lock(&va_block->lock);

    uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) {
        block_context->policy = policy;

        // Unmap may split PTEs and require a retry. Needs to be called
        // before the pinned pages information is destroyed.
        status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
                                           NULL,
                                           uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
                                                                                            block_context,
                                                                                            region));

        uvm_perf_thrashing_info_destroy(va_block);

        if (status != NV_OK)
            break;
    }

    uvm_mutex_unlock(&va_block->lock);

    return status;
}

NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
{
    uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
    uvm_range_tree_node_t *node, *next;
    uvm_va_block_t *va_block;
    NV_STATUS status = NV_OK;

    if (!uvm_hmm_is_enabled(va_space))
        return NV_OK;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
        va_block = hmm_va_block_from_node(node);

        status = hmm_clear_thrashing_policy(va_block, block_context);
        if (status != NV_OK)
            break;
    }

    return status;
}

uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
                                                  uvm_va_block_context_t *va_block_context,
                                                  NvU64 address)
{
    struct vm_area_struct *vma = va_block_context->hmm.vma;
    const uvm_va_policy_t *policy = va_block_context->policy;
    NvU64 start, end;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));

    // We need to limit the prefetch region to the VMA.
    start = max(va_block->start, (NvU64)vma->vm_start);
    end = min(va_block->end, (NvU64)vma->vm_end - 1);

    // Also, we need to limit the prefetch region to the policy range.
    if (uvm_va_policy_is_default(policy)) {
        NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
                                                       address,
                                                       &start,
                                                       &end);
        // We already know the hole exists and covers the fault region.
        UVM_ASSERT(status == NV_OK);
    }
    else {
        const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);

        start = max(start, node->node.start);
        end = min(end, node->node.end);
    }

    return uvm_va_block_region_from_start_end(va_block, start, end);
}

uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        NvU64 addr)
{
    struct vm_area_struct *vma = va_block_context->hmm.vma;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    uvm_assert_mmap_lock_locked(va_block_context->mm);
    UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);

    if (!(vma->vm_flags & VM_READ))
        return UVM_PROT_NONE;
    else if (!(vma->vm_flags & VM_WRITE))
        return UVM_PROT_READ_ONLY;
    else
        return UVM_PROT_READ_WRITE_ATOMIC;
}

static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
                                                uvm_page_index_t page_index,
                                                struct page *page)
{
    uvm_cpu_chunk_t *chunk;
    NV_STATUS status;

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));
    UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index));

    if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)))
        return NV_ERR_INVALID_ADDRESS;

    status = uvm_cpu_chunk_alloc_hmm(page, &chunk);
    if (status != NV_OK)
        return status;

    status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
    if (status != NV_OK) {
        uvm_cpu_chunk_free(chunk);
        return status;
    }

    status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
    if (status != NV_OK) {
        uvm_cpu_chunk_remove_from_block(va_block, page_index);
        uvm_cpu_chunk_free(chunk);
    }

    return status;
}

static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
                                             uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);

    UVM_ASSERT(uvm_va_block_is_hmm(va_block));

    if (!chunk)
        return;

    UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
               !uvm_page_mask_test(&va_block->cpu.resident, page_index));

    uvm_cpu_chunk_remove_from_block(va_block, page_index);
    uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
    uvm_cpu_chunk_free(chunk);
}

static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
                                          uvm_page_index_t page_index,
                                          struct page *page)
{
    struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);

    UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
    return old_page == page;
}

// uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the
// service_context masks to match what is being processed. Since a page
// that was expected to be processed isn't migrating, we have to clear the
// masks to make service_context consistent with what is actually being
// handled.
static void clear_service_context_masks(uvm_service_block_context_t *service_context,
                                        uvm_processor_id_t new_residency,
                                        uvm_page_index_t page_index)
{
    uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);

    uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
                        page_index);

    if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency))
        uvm_processor_mask_clear(&service_context->resident_processors, new_residency);

    if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency))
        uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index);

    if (service_context->thrashing_pin_count > 0 &&
        uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) {
        service_context->thrashing_pin_count--;
    }

    if (service_context->read_duplicate_count > 0 &&
        uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) {
        service_context->read_duplicate_count--;
    }
}

static void cpu_mapping_set(uvm_va_block_t *va_block,
                            bool is_write,
                            uvm_page_index_t page_index)
{
    uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
    uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
    uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
    if (is_write)
        uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
    else
        uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
}

static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
    uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
    if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
        uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
}

static void gpu_chunk_remove(uvm_va_block_t *va_block,
                             uvm_page_index_t page_index,
                             struct page *page)
{
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_id_t id;

    id = uvm_pmm_devmem_page_to_gpu_id(page);
    gpu_state = uvm_va_block_gpu_state_get(va_block, id);
    UVM_ASSERT(gpu_state);

    gpu_chunk = gpu_state->chunks[page_index];
    if (!gpu_chunk) {
        // If we didn't find a chunk it's because the page was unmapped for
        // mremap and no fault has established a new mapping.
        UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
        return;
    }

    // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks

    uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
    gpu_state->chunks[page_index] = NULL;
}

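// gpu_chunk_add() is the counterpart of gpu_chunk_remove(): it records an
// existing device private GPU chunk in this va_block's GPU state and marks the
// corresponding page resident on that GPU.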
static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
                               uvm_page_index_t page_index,
                               struct page *page)
{
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_id_t id;
    NV_STATUS status;

    id = uvm_pmm_devmem_page_to_gpu_id(page);
    gpu_state = uvm_va_block_gpu_state_get(va_block, id);

    // It's possible that this is a fresh va_block we're trying to add an
    // existing gpu_chunk to. This occurs for example when a GPU faults on a
    // virtual address that has been remapped with mremap().
    if (!gpu_state) {
        status = uvm_va_block_gpu_state_alloc(va_block);
        if (status != NV_OK)
            return status;
        gpu_state = uvm_va_block_gpu_state_get(va_block, id);
    }

    UVM_ASSERT(gpu_state);

    // Note that a mremap() might be to a CPU virtual address that is no longer
    // aligned with a larger GPU chunk size. We would need to allocate a new
    // aligned GPU chunk and copy from old to new.
    // TODO: Bug 3368756: add support for large GPU pages.
    gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
    UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
    UVM_ASSERT(gpu_chunk->is_referenced);
    UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);

    if (gpu_state->chunks[page_index] == gpu_chunk)
        return NV_OK;

    UVM_ASSERT(!gpu_state->chunks[page_index]);

    // In some configurations such as SR-IOV heavy, the chunk cannot be
    // referenced using its physical address. Create a virtual mapping.
    status = uvm_mmu_chunk_map(gpu_chunk);
    if (status != NV_OK)
        return status;

    // TODO: Bug 3898467: map indirect peers.

    uvm_processor_mask_set(&va_block->resident, id);
    uvm_page_mask_set(&gpu_state->resident, page_index);

    // It is safe to modify the page index field without holding any PMM locks
    // because the chunk is allocated, which means that none of the other
    // fields in the bitmap can change.
    gpu_chunk->va_block = va_block;
    gpu_chunk->va_block_page_index = page_index;

    gpu_state->chunks[page_index] = gpu_chunk;

    return NV_OK;
}

// This is called just before calling migrate_vma_finalize() in order to wait
// for GPU operations to complete and update the va_block state to match which
// pages migrated (or not) and therefore which pages will be released by
// migrate_vma_finalize().
// 'migrated_pages' is the mask of pages that migrated,
// 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
// and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
// 'region' is the page index region of all migrated, non-migrated, and
// same_devmem_page_mask pages.
static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
                                           const unsigned long *src_pfns,
                                           const unsigned long *dst_pfns,
                                           uvm_va_block_region_t region,
                                           const uvm_page_mask_t *migrated_pages,
                                           const uvm_page_mask_t *same_devmem_page_mask)
{
    uvm_page_index_t page_index;
    NV_STATUS status;

    // Wait for the GPU to finish. migrate_vma_finalize() will release the
    // migrated source pages (or non migrating destination pages), so GPU
    // operations must be finished by then.
    status = uvm_tracker_wait(&va_block->tracker);

    for_each_va_block_page_in_region(page_index, region) {
        struct page *page;

        if (uvm_page_mask_test(same_devmem_page_mask, page_index))
            continue;

        // If a page migrated, clean up the source page.
        // Otherwise, clean up the destination page.
1707 if (uvm_page_mask_test(migrated_pages, page_index)) 1708 page = migrate_pfn_to_page(src_pfns[page_index]); 1709 else 1710 page = migrate_pfn_to_page(dst_pfns[page_index]); 1711 1712 if (!page) 1713 continue; 1714 1715 if (is_device_private_page(page)) { 1716 gpu_chunk_remove(va_block, page_index, page); 1717 } 1718 else { 1719 // If the source page is a system memory page, 1720 // migrate_vma_finalize() will release the reference so we should 1721 // clear our pointer to it. 1722 // TODO: Bug 3660922: Need to handle read duplication at some point. 1723 hmm_va_block_cpu_page_unpopulate(va_block, page_index); 1724 } 1725 } 1726 1727 return status; 1728 } 1729 1730 // Update va_block state to reflect that the page isn't migrating. 1731 static void clean_up_non_migrating_page(uvm_va_block_t *va_block, 1732 const unsigned long *src_pfns, 1733 unsigned long *dst_pfns, 1734 uvm_page_index_t page_index) 1735 { 1736 struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]); 1737 1738 if (!dst_page) 1739 return; 1740 1741 // migrate_vma_finalize() will release the dst_page reference so don't keep 1742 // a pointer to it. 1743 if (is_device_private_page(dst_page)) { 1744 gpu_chunk_remove(va_block, page_index, dst_page); 1745 } 1746 else { 1747 UVM_ASSERT(page_ref_count(dst_page) == 1); 1748 1749 hmm_va_block_cpu_page_unpopulate(va_block, page_index); 1750 } 1751 1752 unlock_page(dst_page); 1753 put_page(dst_page); 1754 dst_pfns[page_index] = 0; 1755 } 1756 1757 static void clean_up_non_migrating_pages(uvm_va_block_t *va_block, 1758 const unsigned long *src_pfns, 1759 unsigned long *dst_pfns, 1760 uvm_va_block_region_t region, 1761 uvm_page_mask_t *page_mask) 1762 { 1763 uvm_page_index_t page_index; 1764 NV_STATUS status; 1765 1766 status = uvm_tracker_wait(&va_block->tracker); 1767 UVM_ASSERT(status == NV_OK); 1768 1769 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 1770 clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index); 1771 } 1772 } 1773 1774 // CPU page fault handling. 1775 1776 // Fill in the dst_pfns[page_index] entry given that there is an allocated 1777 // CPU page. 1778 static void lock_block_cpu_page(uvm_va_block_t *va_block, 1779 uvm_page_index_t page_index, 1780 struct page *src_page, 1781 unsigned long *dst_pfns, 1782 uvm_page_mask_t *same_devmem_page_mask) 1783 { 1784 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index); 1785 uvm_va_block_region_t chunk_region; 1786 struct page *dst_page; 1787 1788 UVM_ASSERT(chunk); 1789 UVM_ASSERT(chunk->page); 1790 1791 chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index); 1792 1793 dst_page = chunk->page + (page_index - chunk_region.first); 1794 1795 UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index))); 1796 UVM_ASSERT(!is_device_private_page(dst_page)); 1797 1798 // The source page is usually a device private page but it could be a GPU 1799 // remote mapped system memory page. It could also be a driver allocated 1800 // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned 1801 // by the driver). 1802 if (is_device_private_page(src_page)) { 1803 // Since the page isn't mirrored, it was allocated by alloc_pages() 1804 // and UVM owns the reference. 
We leave the reference count unchanged 1805 // and mark the page pointer as mirrored since UVM is transferring 1806 // ownership to Linux and we don't want UVM to double free the page in 1807 // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page 1808 // does not migrate, it will be freed though. 1809 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 1810 !uvm_page_mask_test(&va_block->cpu.resident, page_index)); 1811 UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL); 1812 UVM_ASSERT(page_ref_count(dst_page) == 1); 1813 uvm_cpu_chunk_make_hmm(chunk); 1814 } 1815 else { 1816 UVM_ASSERT(same_devmem_page_mask); 1817 UVM_ASSERT(src_page == dst_page); 1818 uvm_page_mask_set(same_devmem_page_mask, page_index); 1819 1820 // The call to migrate_vma_setup() will have inserted a migration PTE 1821 // so the CPU has no access. 1822 cpu_mapping_clear(va_block, page_index); 1823 return; 1824 } 1825 1826 lock_page(dst_page); 1827 dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); 1828 } 1829 1830 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block, 1831 uvm_gpu_t *gpu, 1832 uvm_gpu_chunk_t *gpu_chunk) 1833 { 1834 // Tell PMM to expect a callback from Linux to free the page since the 1835 // device private struct page reference count will determine when the 1836 // GPU chunk is free. 1837 UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); 1838 list_del_init(&gpu_chunk->list); 1839 uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block); 1840 } 1841 1842 static void fill_dst_pfn(uvm_va_block_t *va_block, 1843 uvm_gpu_t *gpu, 1844 const unsigned long *src_pfns, 1845 unsigned long *dst_pfns, 1846 uvm_page_index_t page_index, 1847 uvm_page_mask_t *same_devmem_page_mask) 1848 { 1849 unsigned long src_pfn = src_pfns[page_index]; 1850 uvm_gpu_chunk_t *gpu_chunk; 1851 unsigned long pfn; 1852 struct page *dpage; 1853 1854 gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index)); 1855 UVM_ASSERT(gpu_chunk); 1856 UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT); 1857 pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 1858 1859 // If the same GPU page is both source and destination, migrate_vma_pages() 1860 // will see the wrong "expected" reference count and not migrate it, so we 1861 // mark it as not migrating but we keep track of this so we don't confuse 1862 // it with a page that migrate_vma_pages() actually does not migrate. 
    if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) {
        uvm_page_mask_set(same_devmem_page_mask, page_index);
        return;
    }

    dpage = pfn_to_page(pfn);
    UVM_ASSERT(is_device_private_page(dpage));
    UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global);

    hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
    UVM_ASSERT(!page_count(dpage));
    zone_device_page_init(dpage);
    dpage->zone_device_data = va_block->hmm.va_space;

    dst_pfns[page_index] = migrate_pfn(pfn);
}

static void fill_dst_pfns(uvm_va_block_t *va_block,
                          const unsigned long *src_pfns,
                          unsigned long *dst_pfns,
                          uvm_va_block_region_t region,
                          uvm_page_mask_t *page_mask,
                          uvm_page_mask_t *same_devmem_page_mask,
                          uvm_processor_id_t dest_id)
{
    uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
    uvm_page_index_t page_index;

    uvm_page_mask_zero(same_devmem_page_mask);

    for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
        if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE))
            continue;

        fill_dst_pfn(va_block,
                     gpu,
                     src_pfns,
                     dst_pfns,
                     page_index,
                     same_devmem_page_mask);
    }
}

static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
                                       struct vm_area_struct *vma,
                                       const unsigned long *src_pfns,
                                       unsigned long *dst_pfns,
                                       uvm_va_block_region_t region,
                                       uvm_page_mask_t *page_mask,
                                       uvm_page_mask_t *same_devmem_page_mask,
                                       uvm_processor_id_t processor_id,
                                       uvm_service_block_context_t *service_context)
{
    uvm_page_index_t page_index;
    NV_STATUS status = NV_OK;

    for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
        struct page *src_page;
        struct page *dst_page;
        gfp_t gfp;

        if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
            // Device exclusive PTEs are not selected but we still want to
            // process the page so record it as such.
            if (service_context && !UVM_ID_IS_CPU(processor_id) &&
                service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
                uvm_page_mask_set(same_devmem_page_mask, page_index);
                continue;
            }

            // We have previously found a page that is CPU resident which can't
            // be migrated (probably a shared mapping) so make sure we establish
            // a remote mapping for it.
            if (uvm_page_mask_test(same_devmem_page_mask, page_index))
                continue;

            goto clr_mask;
        }

        // This is the page that will be copied to system memory.
        src_page = migrate_pfn_to_page(src_pfns[page_index]);

        if (src_page) {
            // mremap may have caused us to lose the gpu_chunk associated with
            // this va_block/page_index so make sure we have the correct chunk.
            if (is_device_private_page(src_page))
                gpu_chunk_add(va_block, page_index, src_page);

            if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
                lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
                continue;
            }
        }

        UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
                   !uvm_page_mask_test(&va_block->cpu.resident, page_index));

        // Allocate a user system memory page for the destination.
        // This is the typical case since Linux will free the source page when
        // migrating to device private memory.
1963 // If there is no source page, it means the page is pte_none() or the 1964 // zero page. This case "shouldn't happen" because we asked 1965 // migrate_vma_setup() only for device private pages but 1966 // migrate_vma_collect_hole() doesn't check the 1967 // MIGRATE_VMA_SELECT_SYSTEM flag. 1968 gfp = GFP_HIGHUSER_MOVABLE; 1969 if (!src_page) 1970 gfp |= __GFP_ZERO; 1971 1972 dst_page = alloc_page_vma(gfp, 1973 vma, 1974 va_block->start + (page_index << PAGE_SHIFT)); 1975 if (!dst_page) { 1976 // Ignore errors if the page is only for prefetching. 1977 if (service_context && 1978 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH) 1979 goto clr_mask; 1980 UVM_ERR_PRINT("cannot allocate page %u (addr 0x%llx)\n", 1981 page_index, va_block->start + (page_index << PAGE_SHIFT)); 1982 status = NV_ERR_NO_MEMORY; 1983 break; 1984 } 1985 1986 status = hmm_va_block_cpu_page_populate(va_block, page_index, dst_page); 1987 if (status != NV_OK) { 1988 __free_page(dst_page); 1989 // Ignore errors if the page is only for prefetching. 1990 if (service_context && 1991 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH) 1992 goto clr_mask; 1993 break; 1994 } 1995 1996 // Note that we don't call get_page(dst_page) since alloc_page_vma() 1997 // returns with a page reference count of one and we are passing 1998 // ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded 1999 // the page as "mirrored" so that migrate_vma_finalize() and 2000 // hmm_va_block_cpu_page_unpopulate() don't double free the page. 2001 lock_page(dst_page); 2002 dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); 2003 continue; 2004 2005 clr_mask: 2006 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2007 uvm_page_mask_clear(page_mask, page_index); 2008 if (service_context) 2009 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 2010 } 2011 2012 if (status != NV_OK) 2013 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2014 else if (uvm_page_mask_empty(page_mask)) 2015 return NV_WARN_MORE_PROCESSING_REQUIRED; 2016 2017 return status; 2018 } 2019 2020 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2021 { 2022 uvm_processor_id_t processor_id; 2023 uvm_service_block_context_t *service_context; 2024 uvm_va_block_retry_t *va_block_retry; 2025 const unsigned long *src_pfns; 2026 unsigned long *dst_pfns; 2027 uvm_page_mask_t *page_mask; 2028 uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask; 2029 uvm_va_block_t *va_block; 2030 NV_STATUS status = NV_OK; 2031 2032 processor_id = devmem_fault_context->processor_id; 2033 service_context = devmem_fault_context->service_context; 2034 va_block_retry = devmem_fault_context->va_block_retry; 2035 va_block = devmem_fault_context->va_block; 2036 src_pfns = service_context->block_context.hmm.src_pfns; 2037 dst_pfns = service_context->block_context.hmm.dst_pfns; 2038 2039 // Build the migration page mask. 2040 // Note that thrashing pinned pages and prefetch pages are already 2041 // accounted for in service_context->per_processor_masks. 
2042 page_mask = &devmem_fault_context->page_mask; 2043 uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency); 2044 2045 status = alloc_and_copy_to_cpu(va_block, 2046 service_context->block_context.hmm.vma, 2047 src_pfns, 2048 dst_pfns, 2049 service_context->region, 2050 page_mask, 2051 same_devmem_page_mask, 2052 processor_id, 2053 service_context); 2054 if (status != NV_OK) 2055 return status; 2056 2057 // Do the copy but don't update the residency or mapping for the new 2058 // location yet. 2059 return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2060 } 2061 2062 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2063 { 2064 uvm_processor_id_t processor_id; 2065 uvm_service_block_context_t *service_context; 2066 uvm_perf_prefetch_hint_t *prefetch_hint; 2067 uvm_va_block_retry_t *va_block_retry; 2068 const unsigned long *src_pfns; 2069 unsigned long *dst_pfns; 2070 uvm_page_mask_t *page_mask; 2071 uvm_va_block_t *va_block; 2072 uvm_va_block_region_t region; 2073 uvm_page_index_t page_index; 2074 NV_STATUS status, tracker_status; 2075 2076 processor_id = devmem_fault_context->processor_id; 2077 service_context = devmem_fault_context->service_context; 2078 prefetch_hint = &service_context->prefetch_hint; 2079 va_block = devmem_fault_context->va_block; 2080 va_block_retry = devmem_fault_context->va_block_retry; 2081 src_pfns = service_context->block_context.hmm.src_pfns; 2082 dst_pfns = service_context->block_context.hmm.dst_pfns; 2083 region = service_context->region; 2084 2085 page_mask = &devmem_fault_context->page_mask; 2086 2087 // There are a number of reasons why HMM will mark a page as not migrating 2088 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2089 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2090 if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE) 2091 continue; 2092 2093 // If a page isn't migrating and only the GPU page table is being 2094 // updated, continue to process it normally. 2095 if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index)) 2096 continue; 2097 2098 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2099 uvm_page_mask_clear(page_mask, page_index); 2100 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 2101 } 2102 2103 if (uvm_page_mask_empty(page_mask)) 2104 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2105 else 2106 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2107 2108 tracker_status = sync_page_and_chunk_state(va_block, 2109 src_pfns, 2110 dst_pfns, 2111 region, 2112 page_mask, 2113 &devmem_fault_context->same_devmem_page_mask); 2114 2115 return status == NV_OK ? tracker_status : status; 2116 } 2117 2118 static NV_STATUS populate_region(uvm_va_block_t *va_block, 2119 unsigned long *pfns, 2120 uvm_va_block_region_t region, 2121 uvm_page_mask_t *populated_page_mask) 2122 { 2123 uvm_page_index_t page_index; 2124 NV_STATUS status; 2125 2126 // Make sure GPU state is allocated or else the GPU DMA mappings to 2127 // system memory won't be saved. 2128 status = uvm_va_block_gpu_state_alloc(va_block); 2129 if (status != NV_OK) 2130 return status; 2131 2132 for_each_va_block_page_in_region(page_index, region) { 2133 struct page *page; 2134 2135 // This case should only happen when querying CPU residency and we ask 2136 // for something not covered by a VMA. 
Otherwise, hmm_range_fault() 2137 // returns -EFAULT instead of setting the HMM_PFN_ERROR bit. 2138 if (pfns[page_index] & HMM_PFN_ERROR) 2139 return NV_ERR_INVALID_ADDRESS; 2140 2141 if (pfns[page_index] & HMM_PFN_VALID) { 2142 page = hmm_pfn_to_page(pfns[page_index]); 2143 } 2144 else { 2145 // The page can't be evicted since it has to be migrated to the GPU 2146 // first which would leave a device private page entry so this has 2147 // to be a pte_none(), swapped out, or similar entry. 2148 // The page would have been allocated if populate_region() is being 2149 // called from uvm_hmm_va_block_service_locked() so this must be 2150 // for uvm_hmm_va_block_update_residency_info(). Just leave the 2151 // residency/populated information unchanged since 2152 // uvm_hmm_invalidate() should handle that if the underlying page 2153 // is invalidated. 2154 // Also note there can be an allocated page due to GPU-to-GPU 2155 // migration between non-peer or indirect peer GPUs. 2156 continue; 2157 } 2158 2159 if (is_device_private_page(page)) { 2160 // Linux can call hmm_invalidate() and we have to clear the GPU 2161 // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might 2162 // not release the device private struct page reference. Since 2163 // hmm_range_fault() did find a device private PTE, we can 2164 // re-establish the GPU chunk pointer. 2165 status = gpu_chunk_add(va_block, page_index, page); 2166 if (status != NV_OK) 2167 return status; 2168 continue; 2169 } 2170 2171 // If a CPU chunk is already allocated, check to see it matches what 2172 // hmm_range_fault() found. 2173 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2174 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2175 } 2176 else { 2177 status = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2178 if (status != NV_OK) 2179 return status; 2180 2181 // Record that we populated this page. hmm_block_cpu_fault_locked() 2182 // uses this to ensure pages that don't migrate get remote mapped. 2183 if (populated_page_mask) 2184 uvm_page_mask_set(populated_page_mask, page_index); 2185 } 2186 2187 // Since we have a stable snapshot of the CPU pages, we can 2188 // update the residency and protection information. 2189 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2190 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2191 2192 cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index); 2193 } 2194 2195 return NV_OK; 2196 } 2197 2198 static void hmm_range_fault_begin(uvm_va_block_t *va_block) 2199 { 2200 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2201 2202 uvm_assert_mutex_locked(&va_block->lock); 2203 uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed; 2204 } 2205 2206 static bool hmm_range_fault_retry(uvm_va_block_t *va_block) 2207 { 2208 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2209 2210 uvm_assert_mutex_locked(&va_block->lock); 2211 return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed; 2212 } 2213 2214 // Make the region be resident on the CPU by calling hmm_range_fault() to fault 2215 // in CPU pages. 
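// Note that the va_block lock is dropped and re-acquired around
// hmm_range_fault() below, so this can return NV_WARN_MORE_PROCESSING_REQUIRED
// if a concurrent invalidation callback raced with the fault; the caller is
// expected to retry in that case.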
2216 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block, 2217 struct vm_area_struct *vma, 2218 unsigned long *hmm_pfns, 2219 uvm_va_block_region_t region, 2220 NvU8 *access_type, 2221 uvm_page_mask_t *populated_page_mask) 2222 { 2223 uvm_page_index_t page_index; 2224 int ret; 2225 struct hmm_range range = { 2226 .notifier = &va_block->hmm.notifier, 2227 .start = uvm_va_block_region_start(va_block, region), 2228 .end = uvm_va_block_region_end(va_block, region) + 1, 2229 .hmm_pfns = hmm_pfns + region.first, 2230 .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, 2231 .dev_private_owner = &g_uvm_global, 2232 }; 2233 2234 for_each_va_block_page_in_region(page_index, region) { 2235 if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) || 2236 (vma->vm_flags & VM_WRITE)) 2237 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE; 2238 else 2239 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT; 2240 } 2241 2242 hmm_range_fault_begin(va_block); 2243 2244 // Mirror the VA block to the HMM address range. 2245 // Note that we request HMM to handle page faults, which means that it will 2246 // populate and map potentially not-yet-existing pages to the VMA. 2247 // Also note that mmu_interval_read_begin() calls wait_event() for any 2248 // parallel invalidation callbacks to finish so we can't hold locks that 2249 // the invalidation callback acquires. 2250 uvm_mutex_unlock(&va_block->lock); 2251 2252 range.notifier_seq = mmu_interval_read_begin(range.notifier); 2253 ret = hmm_range_fault(&range); 2254 2255 uvm_mutex_lock(&va_block->lock); 2256 2257 if (ret) 2258 return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret); 2259 2260 if (hmm_range_fault_retry(va_block)) 2261 return NV_WARN_MORE_PROCESSING_REQUIRED; 2262 2263 return populate_region(va_block, 2264 hmm_pfns, 2265 region, 2266 populated_page_mask); 2267 } 2268 2269 // Release the reference count on any pages that were made device exclusive. 2270 static void hmm_release_atomic_pages(uvm_va_block_t *va_block, 2271 uvm_service_block_context_t *service_context) 2272 { 2273 uvm_va_block_region_t region = service_context->region; 2274 uvm_page_index_t page_index; 2275 2276 for_each_va_block_page_in_region(page_index, region) { 2277 struct page *page = service_context->block_context.hmm.pages[page_index]; 2278 2279 if (!page) 2280 continue; 2281 2282 unlock_page(page); 2283 put_page(page); 2284 } 2285 } 2286 2287 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, 2288 uvm_va_block_t *va_block, 2289 uvm_va_block_retry_t *va_block_retry, 2290 uvm_service_block_context_t *service_context) 2291 { 2292 uvm_va_block_region_t region = service_context->region; 2293 struct page **pages = service_context->block_context.hmm.pages; 2294 int npages; 2295 uvm_page_index_t page_index; 2296 uvm_make_resident_cause_t cause; 2297 NV_STATUS status; 2298 2299 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2300 !uvm_page_mask_region_full(&va_block->cpu.resident, region)) { 2301 // There is an atomic GPU fault. We need to make sure no pages are 2302 // GPU resident so that make_device_exclusive_range() doesn't call 2303 // migrate_to_ram() and cause a va_space lock recursion problem. 
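        // migrate_to_ram() on one of our own device private pages would
        // re-enter the UVM CPU fault handling path, which takes the va_space
        // lock that is already held here.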
2304 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS) 2305 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 2306 else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS) 2307 cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 2308 else 2309 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 2310 2311 status = uvm_hmm_va_block_migrate_locked(va_block, 2312 va_block_retry, 2313 &service_context->block_context, 2314 UVM_ID_CPU, 2315 region, 2316 cause); 2317 if (status != NV_OK) 2318 goto done; 2319 2320 // make_device_exclusive_range() will try to call migrate_to_ram() 2321 // and deadlock with ourself if the data isn't CPU resident. 2322 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2323 !uvm_page_mask_region_full(&va_block->cpu.resident, region)) { 2324 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2325 goto done; 2326 } 2327 } 2328 2329 // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED 2330 // mmap() files so we check for that here and report a fatal fault. 2331 // Otherwise with the current Linux 6.1 make_device_exclusive_range(), 2332 // it doesn't make the page exclusive and we end up in an endless loop. 2333 if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) { 2334 status = NV_ERR_NOT_SUPPORTED; 2335 goto done; 2336 } 2337 2338 hmm_range_fault_begin(va_block); 2339 2340 uvm_mutex_unlock(&va_block->lock); 2341 2342 npages = make_device_exclusive_range(service_context->block_context.mm, 2343 uvm_va_block_cpu_page_address(va_block, region.first), 2344 uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE, 2345 pages + region.first, 2346 &g_uvm_global); 2347 2348 uvm_mutex_lock(&va_block->lock); 2349 2350 if (npages < 0) { 2351 status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages); 2352 goto done; 2353 } 2354 2355 while ((size_t)npages < uvm_va_block_region_num_pages(region)) 2356 pages[region.first + npages++] = NULL; 2357 2358 if (hmm_range_fault_retry(va_block)) { 2359 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2360 goto release; 2361 } 2362 2363 status = NV_OK; 2364 2365 for_each_va_block_page_in_region(page_index, region) { 2366 struct page *page = pages[page_index]; 2367 2368 if (!page) { 2369 // Record that one of the pages isn't exclusive but keep converting 2370 // the others. 2371 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2372 continue; 2373 } 2374 2375 // If a CPU chunk is already allocated, check to see it matches what 2376 // make_device_exclusive_range() found. 
2377 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2378 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2379 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2380 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index)); 2381 } 2382 else { 2383 NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2384 2385 if (s == NV_OK) { 2386 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2387 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2388 } 2389 } 2390 2391 cpu_mapping_clear(va_block, page_index); 2392 } 2393 2394 if (status != NV_OK) 2395 goto release; 2396 2397 status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2398 if (status != NV_OK) 2399 goto release; 2400 2401 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2402 2403 release: 2404 hmm_release_atomic_pages(va_block, service_context); 2405 2406 done: 2407 return status; 2408 } 2409 2410 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region) 2411 { 2412 uvm_page_index_t page_index; 2413 2414 for_each_va_block_page_in_region(page_index, region) { 2415 if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) 2416 return true; 2417 } 2418 2419 return false; 2420 } 2421 2422 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region) 2423 { 2424 uvm_processor_id_t gpu_id; 2425 2426 for_each_gpu_id_in_mask(gpu_id, &va_block->resident) { 2427 uvm_va_block_gpu_state_t *gpu_state; 2428 2429 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 2430 if (!uvm_page_mask_region_empty(&gpu_state->resident, region)) 2431 return true; 2432 } 2433 2434 return false; 2435 } 2436 2437 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, 2438 uvm_va_block_t *va_block, 2439 uvm_va_block_retry_t *va_block_retry, 2440 uvm_service_block_context_t *service_context) 2441 { 2442 uvm_va_block_region_t region = service_context->region; 2443 struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args; 2444 NV_STATUS status; 2445 int ret; 2446 uvm_hmm_devmem_fault_context_t fault_context = { 2447 .processor_id = processor_id, 2448 .va_block = va_block, 2449 .va_block_retry = va_block_retry, 2450 .service_context = service_context, 2451 }; 2452 2453 // Normally the source page will be a device private page that is being 2454 // migrated to system memory. However, when it is a GPU fault, the source 2455 // page can be a system memory page that the GPU needs to remote map 2456 // instead. However migrate_vma_setup() won't select these types of 2457 // mappings/pages: 2458 // - device exclusive PTEs 2459 // - shared mappings 2460 // - file backed mappings 2461 // Also, if the source and destination page are the same, the page reference 2462 // count won't be the "expected" count and migrate_vma_pages() won't migrate 2463 // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and 2464 // uvm_hmm_devmem_fault_finalize_and_map() still needs to process these 2465 // pages even if src_pfn indicates they are not migrating. 
2466 uvm_page_mask_zero(&fault_context.same_devmem_page_mask); 2467 2468 if (!UVM_ID_IS_CPU(processor_id)) { 2469 if (is_atomic_fault(service_context->access_type, region)) { 2470 return hmm_block_atomic_fault_locked(processor_id, 2471 va_block, 2472 va_block_retry, 2473 service_context); 2474 } 2475 2476 status = hmm_make_resident_cpu(va_block, 2477 service_context->block_context.hmm.vma, 2478 service_context->block_context.hmm.src_pfns, 2479 region, 2480 service_context->access_type, 2481 &fault_context.same_devmem_page_mask); 2482 if (status != NV_OK) 2483 return status; 2484 2485 // If no GPU has a resident copy, we can skip the migrate_vma_*(). 2486 // This is necessary if uvm_hmm_must_use_sysmem() returned true. 2487 if (!is_gpu_resident(va_block, region)) { 2488 status = uvm_va_block_service_copy(processor_id, 2489 UVM_ID_CPU, 2490 va_block, 2491 va_block_retry, 2492 service_context); 2493 if (status != NV_OK) 2494 return status; 2495 2496 return uvm_va_block_service_finish(processor_id, va_block, service_context); 2497 } 2498 } 2499 2500 args->vma = service_context->block_context.hmm.vma; 2501 args->src = service_context->block_context.hmm.src_pfns + region.first; 2502 args->dst = service_context->block_context.hmm.dst_pfns + region.first; 2503 args->start = uvm_va_block_region_start(va_block, region); 2504 args->end = uvm_va_block_region_end(va_block, region) + 1; 2505 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 2506 args->pgmap_owner = &g_uvm_global; 2507 2508 if (UVM_ID_IS_CPU(processor_id)) { 2509 args->fault_page = service_context->cpu_fault.vmf->page; 2510 } 2511 else { 2512 args->flags |= MIGRATE_VMA_SELECT_SYSTEM; 2513 args->fault_page = NULL; 2514 } 2515 2516 ret = migrate_vma_setup_locked(args, va_block); 2517 UVM_ASSERT(!ret); 2518 2519 // The overall process here is to migrate pages from the GPU to the CPU 2520 // and possibly remote map the GPU to sysmem if accessed_by is set. 2521 // This is safe because we hold the va_block lock across the calls to 2522 // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(), 2523 // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize(). 2524 // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block 2525 // lock, a sequence number is used to tell if an invalidate() callback 2526 // occurred while not holding the lock. If the sequence number changes, 2527 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2528 // uvm_va_block_service_locked() called again. Otherwise, there were no 2529 // conflicting invalidate callbacks and our snapshots of the CPU page 2530 // tables are accurate and can be used to DMA pages and update GPU page 2531 // tables. 
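    // The call sequence below is:
    //   uvm_hmm_devmem_fault_alloc_and_copy()   - allocate and lock the
    //       destination CPU pages and start the copies from the GPU
    //   migrate_vma_pages()                     - update the CPU page tables
    //   uvm_hmm_devmem_fault_finalize_and_map() - update va_block state and
    //       establish the new mappings
    //   migrate_vma_finalize()                  - release the source pages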
2532 status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context); 2533 if (status == NV_OK) { 2534 migrate_vma_pages(args); 2535 status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context); 2536 } 2537 2538 migrate_vma_finalize(args); 2539 2540 if (status == NV_WARN_NOTHING_TO_DO) 2541 status = NV_OK; 2542 2543 return status; 2544 } 2545 2546 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, 2547 struct vm_area_struct *vma, 2548 const unsigned long *src_pfns, 2549 unsigned long *dst_pfns, 2550 uvm_va_block_region_t region, 2551 uvm_page_mask_t *page_mask, 2552 uvm_processor_id_t dest_id, 2553 uvm_service_block_context_t *service_context) 2554 { 2555 uvm_page_index_t page_index; 2556 NV_STATUS status = NV_OK; 2557 2558 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2559 struct page *src_page; 2560 2561 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 2562 // HMM currently has some limitations on what pages can be migrated. 2563 // For example, no file backed pages, device private pages owned by 2564 // a different device, device exclusive or swapped out pages. 2565 goto clr_mask; 2566 } 2567 2568 // This is the page that will be copied to the destination GPU. 2569 src_page = migrate_pfn_to_page(src_pfns[page_index]); 2570 if (src_page) { 2571 if (is_device_private_page(src_page)) { 2572 status = gpu_chunk_add(va_block, page_index, src_page); 2573 if (status != NV_OK) 2574 break; 2575 continue; 2576 } 2577 2578 if (PageSwapCache(src_page)) { 2579 // TODO: Bug 4050579: Remove this when swap cached pages can be 2580 // migrated. 2581 if (service_context) { 2582 service_context->block_context.hmm.swap_cached = true; 2583 break; 2584 } 2585 2586 goto clr_mask; 2587 } 2588 2589 // If the page is already allocated, it is most likely a mirrored 2590 // page. Check to be sure it matches what we have recorded. The 2591 // page shouldn't be a staging page from a GPU to GPU migration 2592 // or a remote mapped atomic sysmem page because migrate_vma_setup() 2593 // found a normal page and non-mirrored pages are only known 2594 // privately to the UVM driver. 2595 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2596 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page)); 2597 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2598 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index)); 2599 } 2600 else { 2601 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page); 2602 if (status != NV_OK) 2603 goto clr_mask; 2604 2605 // Since there is a CPU resident page, there shouldn't be one 2606 // anywhere else. TODO: Bug 3660922: Need to handle read 2607 // duplication at some point. 2608 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2609 2610 // migrate_vma_setup() was able to isolate and lock the page; 2611 // therefore, it is CPU resident and not mapped. 2612 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU); 2613 uvm_page_mask_set(&va_block->cpu.resident, page_index); 2614 } 2615 2616 // The call to migrate_vma_setup() will have inserted a migration 2617 // PTE so the CPU has no access. 2618 cpu_mapping_clear(va_block, page_index); 2619 } 2620 else { 2621 // It is OK to migrate an empty anonymous page, a zero page will 2622 // be allocated on the GPU. Just be sure to free any pages 2623 // used for GPU to GPU copies. It can't be an evicted page because 2624 // migrate_vma_setup() would have found a source page. 
2625 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2626 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2627 2628 hmm_va_block_cpu_page_unpopulate(va_block, page_index); 2629 } 2630 } 2631 2632 continue; 2633 2634 clr_mask: 2635 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2636 uvm_page_mask_clear(page_mask, page_index); 2637 if (service_context) 2638 clear_service_context_masks(service_context, dest_id, page_index); 2639 } 2640 2641 if (uvm_page_mask_empty(page_mask) || 2642 (service_context && service_context->block_context.hmm.swap_cached)) 2643 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2644 2645 if (status != NV_OK) 2646 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2647 2648 return status; 2649 } 2650 2651 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, 2652 uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2653 { 2654 uvm_processor_id_t processor_id; 2655 uvm_processor_id_t new_residency; 2656 uvm_va_block_t *va_block; 2657 uvm_va_block_retry_t *va_block_retry; 2658 uvm_service_block_context_t *service_context; 2659 uvm_perf_prefetch_hint_t *prefetch_hint; 2660 const unsigned long *src_pfns; 2661 unsigned long *dst_pfns; 2662 uvm_va_block_region_t region; 2663 uvm_page_mask_t *page_mask; 2664 NV_STATUS status; 2665 2666 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2667 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2668 va_block = uvm_hmm_gpu_fault_event->va_block; 2669 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2670 service_context = uvm_hmm_gpu_fault_event->service_context; 2671 region = service_context->region; 2672 prefetch_hint = &service_context->prefetch_hint; 2673 src_pfns = service_context->block_context.hmm.src_pfns; 2674 dst_pfns = service_context->block_context.hmm.dst_pfns; 2675 2676 // Build the migration mask. 2677 // Note that thrashing pinned pages are already accounted for in 2678 // service_context->resident_processors. 2679 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2680 uvm_page_mask_copy(page_mask, 2681 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 2682 2683 status = dmamap_src_sysmem_pages(va_block, 2684 vma, 2685 src_pfns, 2686 dst_pfns, 2687 region, 2688 page_mask, 2689 new_residency, 2690 service_context); 2691 if (status != NV_OK) 2692 return status; 2693 2694 // Do the alloc and copy but don't update the residency or mapping for the 2695 // new location yet. 2696 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context); 2697 if (status != NV_OK) 2698 return status; 2699 2700 // Record the destination PFNs of device private struct pages now that 2701 // uvm_va_block_service_copy() has populated the GPU destination pages. 
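    // fill_dst_pfns() encodes each destination GPU chunk's device private PFN
    // with migrate_pfn() and takes a struct page reference on it via
    // zone_device_page_init(), so migrate_vma_pages() can install the new
    // device private PTEs.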
2702 fill_dst_pfns(va_block, 2703 src_pfns, 2704 dst_pfns, 2705 region, 2706 page_mask, 2707 &uvm_hmm_gpu_fault_event->same_devmem_page_mask, 2708 new_residency); 2709 2710 return status; 2711 } 2712 2713 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2714 { 2715 uvm_processor_id_t processor_id; 2716 uvm_processor_id_t new_residency; 2717 uvm_va_block_t *va_block; 2718 uvm_va_block_retry_t *va_block_retry; 2719 uvm_service_block_context_t *service_context; 2720 const unsigned long *src_pfns; 2721 unsigned long *dst_pfns; 2722 uvm_va_block_region_t region; 2723 uvm_page_index_t page_index; 2724 uvm_page_mask_t *page_mask; 2725 NV_STATUS status, tracker_status; 2726 2727 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2728 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2729 va_block = uvm_hmm_gpu_fault_event->va_block; 2730 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2731 service_context = uvm_hmm_gpu_fault_event->service_context; 2732 src_pfns = service_context->block_context.hmm.src_pfns; 2733 dst_pfns = service_context->block_context.hmm.dst_pfns; 2734 region = service_context->region; 2735 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2736 2737 // There are a number of reasons why HMM will mark a page as not migrating 2738 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2739 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2740 unsigned long src_pfn = src_pfns[page_index]; 2741 2742 if (src_pfn & MIGRATE_PFN_MIGRATE) 2743 continue; 2744 2745 // If a device private page isn't migrating and only the GPU page table 2746 // is being updated, continue to process it normally. 2747 if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index)) 2748 continue; 2749 2750 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2751 uvm_page_mask_clear(page_mask, page_index); 2752 clear_service_context_masks(service_context, new_residency, page_index); 2753 } 2754 2755 if (uvm_page_mask_empty(page_mask)) 2756 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2757 else 2758 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2759 2760 tracker_status = sync_page_and_chunk_state(va_block, 2761 src_pfns, 2762 dst_pfns, 2763 region, 2764 page_mask, 2765 &uvm_hmm_gpu_fault_event->same_devmem_page_mask); 2766 2767 return status == NV_OK ? tracker_status : status; 2768 } 2769 2770 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, 2771 uvm_processor_id_t new_residency, 2772 uvm_va_block_t *va_block, 2773 uvm_va_block_retry_t *va_block_retry, 2774 uvm_service_block_context_t *service_context) 2775 { 2776 struct mm_struct *mm = service_context->block_context.mm; 2777 struct vm_area_struct *vma = service_context->block_context.hmm.vma; 2778 uvm_va_block_region_t region = service_context->region; 2779 uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event; 2780 struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args; 2781 int ret; 2782 NV_STATUS status = NV_ERR_INVALID_ADDRESS; 2783 2784 if (!mm) 2785 return status; 2786 2787 uvm_assert_mmap_lock_locked(mm); 2788 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 2789 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 2790 uvm_assert_mutex_locked(&va_block->lock); 2791 UVM_ASSERT(vma); 2792 2793 // If the desired destination is the CPU, try to fault in CPU pages. 
2794 if (UVM_ID_IS_CPU(new_residency)) 2795 return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context); 2796 2797 uvm_hmm_gpu_fault_event.processor_id = processor_id; 2798 uvm_hmm_gpu_fault_event.new_residency = new_residency; 2799 uvm_hmm_gpu_fault_event.va_block = va_block; 2800 uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry; 2801 uvm_hmm_gpu_fault_event.service_context = service_context; 2802 2803 args->vma = vma; 2804 args->src = service_context->block_context.hmm.src_pfns + region.first; 2805 args->dst = service_context->block_context.hmm.dst_pfns + region.first; 2806 args->start = uvm_va_block_region_start(va_block, region); 2807 args->end = uvm_va_block_region_end(va_block, region) + 1; 2808 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 2809 args->pgmap_owner = &g_uvm_global; 2810 args->fault_page = NULL; 2811 2812 ret = migrate_vma_setup_locked(args, va_block); 2813 UVM_ASSERT(!ret); 2814 2815 // The overall process here is to migrate pages from the CPU or GPUs to the 2816 // faulting GPU. 2817 // This is safe because we hold the va_block lock across the calls to 2818 // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(), 2819 // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize(). 2820 // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block 2821 // lock, a sequence number is used to tell if an invalidate() callback 2822 // occurred while not holding the lock. If the sequence number changes, 2823 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2824 // uvm_va_block_service_locked() called again. Otherwise, there were no 2825 // conflicting invalidate callbacks and our snapshots of the CPU page 2826 // tables are accurate and can be used to DMA pages and update GPU page 2827 // tables. TODO: Bug 3901904: there might be better ways of handling no 2828 // page being migrated. 2829 status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event); 2830 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 2831 migrate_vma_finalize(args); 2832 2833 // migrate_vma_setup() might have not been able to lock/isolate any 2834 // pages because they are swapped out or are device exclusive. 2835 // We do know that none of the pages in the region are zero pages 2836 // since migrate_vma_setup() would have reported that information. 2837 // Try to make it resident in system memory and retry the migration. 
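        // NV_WARN_MORE_PROCESSING_REQUIRED is returned below regardless of
        // the hmm_make_resident_cpu() status so that the caller retries
        // uvm_va_block_service_locked() and the fault service is re-attempted.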
2838 status = hmm_make_resident_cpu(va_block, 2839 service_context->block_context.hmm.vma, 2840 service_context->block_context.hmm.src_pfns, 2841 region, 2842 service_context->access_type, 2843 NULL); 2844 return NV_WARN_MORE_PROCESSING_REQUIRED; 2845 } 2846 2847 if (status == NV_OK) { 2848 migrate_vma_pages(args); 2849 status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event); 2850 } 2851 2852 migrate_vma_finalize(args); 2853 2854 if (status == NV_WARN_NOTHING_TO_DO) 2855 status = NV_OK; 2856 2857 return status; 2858 } 2859 2860 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, 2861 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 2862 { 2863 uvm_va_block_t *va_block; 2864 uvm_va_block_retry_t *va_block_retry; 2865 uvm_va_block_context_t *va_block_context; 2866 const unsigned long *src_pfns; 2867 unsigned long *dst_pfns; 2868 uvm_va_block_region_t region; 2869 uvm_processor_id_t dest_id; 2870 uvm_page_mask_t *page_mask; 2871 NV_STATUS status; 2872 2873 va_block = uvm_hmm_migrate_event->va_block; 2874 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 2875 va_block_context = uvm_hmm_migrate_event->va_block_context; 2876 src_pfns = va_block_context->hmm.src_pfns; 2877 dst_pfns = va_block_context->hmm.dst_pfns; 2878 region = uvm_hmm_migrate_event->region; 2879 dest_id = uvm_hmm_migrate_event->dest_id; 2880 page_mask = &uvm_hmm_migrate_event->page_mask; 2881 uvm_page_mask_init_from_region(page_mask, region, NULL); 2882 uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask); 2883 2884 uvm_assert_mutex_locked(&va_block->lock); 2885 2886 if (UVM_ID_IS_CPU(dest_id)) { 2887 status = alloc_and_copy_to_cpu(va_block, 2888 vma, 2889 src_pfns, 2890 dst_pfns, 2891 region, 2892 page_mask, 2893 &uvm_hmm_migrate_event->same_devmem_page_mask, 2894 UVM_ID_INVALID, 2895 NULL); 2896 } 2897 else { 2898 status = dmamap_src_sysmem_pages(va_block, 2899 vma, 2900 src_pfns, 2901 dst_pfns, 2902 region, 2903 page_mask, 2904 dest_id, 2905 NULL); 2906 } 2907 if (status != NV_OK) 2908 return status; 2909 2910 UVM_ASSERT(!uvm_va_policy_is_read_duplicate(va_block_context->policy, va_block->hmm.va_space)); 2911 2912 status = uvm_va_block_make_resident_copy(va_block, 2913 va_block_retry, 2914 va_block_context, 2915 dest_id, 2916 region, 2917 page_mask, 2918 NULL, 2919 uvm_hmm_migrate_event->cause); 2920 if (status != NV_OK) 2921 return status; 2922 2923 if (!UVM_ID_IS_CPU(dest_id)) { 2924 // Record the destination PFNs of device private struct pages now that 2925 // uvm_va_block_make_resident_copy() has populated the GPU destination 2926 // pages. 
2927 fill_dst_pfns(va_block, 2928 src_pfns, 2929 dst_pfns, 2930 region, 2931 page_mask, 2932 &uvm_hmm_migrate_event->same_devmem_page_mask, 2933 dest_id); 2934 } 2935 2936 return status; 2937 } 2938 2939 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 2940 { 2941 uvm_va_block_t *va_block; 2942 uvm_va_block_retry_t *va_block_retry; 2943 uvm_va_block_context_t *va_block_context; 2944 uvm_va_block_region_t region; 2945 uvm_processor_id_t dest_id; 2946 uvm_page_index_t page_index; 2947 uvm_page_mask_t *page_mask; 2948 const unsigned long *src_pfns; 2949 unsigned long *dst_pfns; 2950 2951 va_block = uvm_hmm_migrate_event->va_block; 2952 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 2953 va_block_context = uvm_hmm_migrate_event->va_block_context; 2954 region = uvm_hmm_migrate_event->region; 2955 dest_id = uvm_hmm_migrate_event->dest_id; 2956 page_mask = &uvm_hmm_migrate_event->page_mask; 2957 src_pfns = va_block_context->hmm.src_pfns; 2958 dst_pfns = va_block_context->hmm.dst_pfns; 2959 2960 uvm_assert_mutex_locked(&va_block->lock); 2961 2962 // There are a number of reasons why HMM will mark a page as not migrating 2963 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2964 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2965 unsigned long src_pfn = src_pfns[page_index]; 2966 2967 if (src_pfn & MIGRATE_PFN_MIGRATE) 2968 continue; 2969 2970 // If a device private page isn't migrating and only the GPU page table 2971 // is being updated, continue to process it normally. 2972 if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index)) 2973 continue; 2974 2975 uvm_page_mask_clear(page_mask, page_index); 2976 } 2977 2978 uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask); 2979 2980 return sync_page_and_chunk_state(va_block, 2981 src_pfns, 2982 dst_pfns, 2983 region, 2984 page_mask, 2985 &uvm_hmm_migrate_event->same_devmem_page_mask); 2986 } 2987 2988 static bool is_resident(uvm_va_block_t *va_block, 2989 uvm_processor_id_t dest_id, 2990 uvm_va_block_region_t region) 2991 { 2992 if (!uvm_processor_mask_test(&va_block->resident, dest_id)) 2993 return false; 2994 2995 return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region); 2996 } 2997 2998 // Note that migrate_vma_*() doesn't handle asynchronous migrations so the 2999 // migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect. 3000 // TODO: Bug 3900785: investigate ways to implement async migration. 
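// Locking: the caller must hold the mmap_lock, the va_space lock, the
// va_block HMM migrate_lock and the va_block lock (see the asserts at the
// top of the function). The va_block lock may be dropped and retaken
// internally while waiting for hmm_range_fault().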
3001 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, 3002 uvm_va_block_retry_t *va_block_retry, 3003 uvm_va_block_context_t *va_block_context, 3004 uvm_processor_id_t dest_id, 3005 uvm_va_block_region_t region, 3006 uvm_make_resident_cause_t cause) 3007 { 3008 uvm_hmm_migrate_event_t uvm_hmm_migrate_event; 3009 struct vm_area_struct *vma = va_block_context->hmm.vma; 3010 NvU64 start; 3011 NvU64 end; 3012 struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args; 3013 NV_STATUS status; 3014 int ret; 3015 3016 UVM_ASSERT(vma); 3017 UVM_ASSERT(va_block_context->mm == vma->vm_mm); 3018 uvm_assert_mmap_lock_locked(va_block_context->mm); 3019 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 3020 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 3021 uvm_assert_mutex_locked(&va_block->lock); 3022 3023 start = uvm_va_block_region_start(va_block, region); 3024 end = uvm_va_block_region_end(va_block, region); 3025 UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end); 3026 3027 uvm_hmm_migrate_event.va_block = va_block; 3028 uvm_hmm_migrate_event.va_block_retry = va_block_retry; 3029 uvm_hmm_migrate_event.va_block_context = va_block_context; 3030 uvm_hmm_migrate_event.region = region; 3031 uvm_hmm_migrate_event.dest_id = dest_id; 3032 uvm_hmm_migrate_event.cause = cause; 3033 3034 args->vma = vma; 3035 args->src = va_block_context->hmm.src_pfns + region.first; 3036 args->dst = va_block_context->hmm.dst_pfns + region.first; 3037 args->start = uvm_va_block_region_start(va_block, region); 3038 args->end = uvm_va_block_region_end(va_block, region) + 1; 3039 args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE : 3040 MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 3041 args->pgmap_owner = &g_uvm_global; 3042 args->fault_page = NULL; 3043 3044 // Note that migrate_vma_setup() doesn't handle file backed or VM_SPECIAL 3045 // VMAs so if UvmMigrate() tries to migrate such a region, -EINVAL will 3046 // be returned and we will only try to make the pages be CPU resident. 3047 ret = migrate_vma_setup_locked(args, va_block); 3048 if (ret) 3049 return hmm_make_resident_cpu(va_block, 3050 vma, 3051 va_block_context->hmm.src_pfns, 3052 region, 3053 NULL, 3054 NULL); 3055 3056 // The overall process here is to migrate pages from the CPU or GPUs to the 3057 // destination processor. Note that block_migrate_add_mappings() handles 3058 // updating GPU mappings after the migration. 3059 // This is safe because we hold the va_block lock across the calls to 3060 // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(), 3061 // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and 3062 // block_migrate_add_mappings(). 3063 // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block 3064 // lock, a sequence number is used to tell if an invalidate() callback 3065 // occurred while not holding the lock. If the sequence number changes, 3066 // all the locks need to be dropped (mm, va_space, va_block) and the whole 3067 // uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no 3068 // conflicting invalidate callbacks and our snapshots of the CPU page 3069 // tables are accurate and can be used to DMA pages and update GPU page 3070 // tables. 
3071 status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event); 3072 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 3073 uvm_processor_id_t id; 3074 uvm_page_mask_t *page_mask; 3075 3076 migrate_vma_finalize(args); 3077 3078 // The CPU pages tables might contain only device private pages or 3079 // the migrate_vma_setup() might have not been able to lock/isolate 3080 // any pages because they are swapped out, or on another device. 3081 // We do know that none of the pages in the region are zero pages 3082 // since migrate_vma_setup() would have reported that information. 3083 // Collect all the pages that need to be faulted in and made CPU 3084 // resident, then do the hmm_range_fault() and retry. 3085 page_mask = &va_block_context->caller_page_mask; 3086 uvm_page_mask_init_from_region(page_mask, region, NULL); 3087 3088 for_each_id_in_mask(id, &va_block->resident) { 3089 if (!uvm_page_mask_andnot(page_mask, 3090 page_mask, 3091 uvm_va_block_resident_mask_get(va_block, id))) 3092 return NV_OK; 3093 } 3094 3095 return hmm_make_resident_cpu(va_block, 3096 vma, 3097 va_block_context->hmm.src_pfns, 3098 region, 3099 NULL, 3100 NULL); 3101 } 3102 3103 if (status == NV_OK) { 3104 migrate_vma_pages(args); 3105 status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event); 3106 } 3107 3108 migrate_vma_finalize(args); 3109 3110 if (status == NV_WARN_NOTHING_TO_DO) 3111 status = NV_OK; 3112 3113 return status; 3114 } 3115 3116 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, 3117 uvm_va_block_context_t *va_block_context, 3118 NvU64 base, 3119 NvU64 length, 3120 uvm_processor_id_t dest_id, 3121 uvm_migrate_mode_t mode, 3122 uvm_tracker_t *out_tracker) 3123 { 3124 struct mm_struct *mm; 3125 uvm_va_block_t *va_block; 3126 uvm_va_block_retry_t va_block_retry; 3127 NvU64 addr, end, last_address; 3128 NV_STATUS status = NV_OK; 3129 3130 if (!uvm_hmm_is_enabled(va_space)) 3131 return NV_ERR_INVALID_ADDRESS; 3132 3133 mm = va_block_context->mm; 3134 UVM_ASSERT(mm == va_space->va_space_mm.mm); 3135 uvm_assert_mmap_lock_locked(mm); 3136 uvm_assert_rwsem_locked(&va_space->lock); 3137 3138 last_address = base + length - 1; 3139 3140 for (addr = base; addr < last_address; addr = end + 1) { 3141 struct vm_area_struct *vma; 3142 3143 status = hmm_va_block_find_create(va_space, addr, false, va_block_context, &va_block); 3144 if (status != NV_OK) 3145 return status; 3146 3147 end = va_block->end; 3148 if (end > last_address) 3149 end = last_address; 3150 3151 vma = va_block_context->hmm.vma; 3152 if (end > vma->vm_end - 1) 3153 end = vma->vm_end - 1; 3154 3155 status = hmm_migrate_range(va_block, 3156 &va_block_retry, 3157 va_block_context, 3158 dest_id, 3159 addr, 3160 end, 3161 mode, 3162 out_tracker); 3163 if (status != NV_OK) 3164 break; 3165 } 3166 3167 return status; 3168 } 3169 3170 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block, 3171 uvm_va_block_context_t *va_block_context, 3172 uvm_gpu_chunk_t *gpu_chunk, 3173 uvm_va_block_region_t chunk_region) 3174 { 3175 uvm_thread_context_t *uvm_context = uvm_thread_context(); 3176 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3177 uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 3178 unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 3179 uvm_page_index_t page_index = chunk_region.first; 3180 int ret; 3181 3182 uvm_assert_mutex_locked(&va_block->lock); 3183 // TODO: Bug 3368756: add support for large GPU pages. 
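    // Until large page support is added, every device private chunk backs
    // exactly one PAGE_SIZE page, so the chunk region passed in here must be
    // a single page.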
3184 UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1); 3185 3186 uvm_context->ignore_hmm_invalidate_va_block = va_block; 3187 ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region)); 3188 uvm_context->ignore_hmm_invalidate_va_block = NULL; 3189 if (ret) 3190 return errno_to_nv_status(ret); 3191 3192 return NV_OK; 3193 } 3194 3195 // Note that the caller must initialize va_block_context->hmm.src_pfns by 3196 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this. 3197 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, 3198 uvm_va_block_context_t *va_block_context, 3199 const uvm_page_mask_t *pages_to_evict, 3200 uvm_va_block_region_t region, 3201 uvm_make_resident_cause_t cause, 3202 bool *out_accessed_by_set) 3203 { 3204 NvU64 start = uvm_va_block_region_start(va_block, region); 3205 NvU64 end = uvm_va_block_region_end(va_block, region); 3206 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3207 unsigned long *dst_pfns = va_block_context->hmm.dst_pfns; 3208 uvm_hmm_migrate_event_t uvm_hmm_migrate_event = { 3209 .va_block = va_block, 3210 .va_block_retry = NULL, 3211 .va_block_context = va_block_context, 3212 .region = region, 3213 .dest_id = UVM_ID_CPU, 3214 .cause = cause, 3215 }; 3216 uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask; 3217 const uvm_va_policy_t *policy; 3218 uvm_va_policy_node_t *node; 3219 unsigned long npages; 3220 NV_STATUS status; 3221 3222 uvm_assert_mutex_locked(&va_block->lock); 3223 3224 if (out_accessed_by_set) 3225 *out_accessed_by_set = false; 3226 3227 // Note that there is no VMA available when evicting HMM pages. 3228 va_block_context->hmm.vma = NULL; 3229 3230 uvm_page_mask_copy(page_mask, pages_to_evict); 3231 3232 uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) { 3233 npages = uvm_va_block_region_num_pages(region); 3234 3235 va_block_context->policy = policy; 3236 if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0) 3237 *out_accessed_by_set = true; 3238 3239 // Pages resident on the GPU should not have a resident page in system 3240 // memory. 3241 // TODO: Bug 3660922: Need to handle read duplication at some point. 
        UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));

        status = alloc_and_copy_to_cpu(va_block,
                                       NULL,
                                       src_pfns,
                                       dst_pfns,
                                       region,
                                       page_mask,
                                       NULL,
                                       UVM_ID_INVALID,
                                       NULL);
        if (status != NV_OK)
            goto err;

        status = uvm_va_block_make_resident_copy(va_block,
                                                 NULL,
                                                 va_block_context,
                                                 UVM_ID_CPU,
                                                 region,
                                                 page_mask,
                                                 NULL,
                                                 cause);
        if (status != NV_OK)
            goto err;

        migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);

        uvm_hmm_migrate_event.region = region;

        status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
        if (status != NV_OK)
            goto err;

        migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
    }

    return NV_OK;

err:
    migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
    return status;
}

NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        const uvm_page_mask_t *pages_to_evict,
                                        uvm_va_block_region_t region,
                                        bool *out_accessed_by_set)
{
    return hmm_va_block_evict_chunks(va_block,
                                     va_block_context,
                                     pages_to_evict,
                                     region,
                                     UVM_MAKE_RESIDENT_CAUSE_EVICTION,
                                     out_accessed_by_set);
}

NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
                                                uvm_gpu_t *gpu,
                                                uvm_va_block_context_t *va_block_context,
                                                const uvm_page_mask_t *pages_to_evict,
                                                uvm_va_block_region_t region)
{
    unsigned long *src_pfns = va_block_context->hmm.src_pfns;
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_page_index_t page_index;
    uvm_gpu_chunk_t *gpu_chunk;
    NV_STATUS status;

    uvm_assert_mutex_locked(&va_block->lock);

    gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
    UVM_ASSERT(gpu_state);
    UVM_ASSERT(gpu_state->chunks);

    // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
    memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));

    // TODO: Bug 3368756: add support for large GPU pages.
    for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
        gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
                                                  gpu,
                                                  uvm_va_block_cpu_page_address(va_block, page_index));
        status = uvm_hmm_va_block_evict_chunk_prep(va_block,
                                                   va_block_context,
                                                   gpu_chunk,
                                                   uvm_va_block_region_for_page(page_index));
        if (status != NV_OK)
            return status;
    }

    return hmm_va_block_evict_chunks(va_block,
                                     va_block_context,
                                     pages_to_evict,
                                     region,
                                     UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
                                     NULL);
}

NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
    unsigned long src_pfn = 0;
    unsigned long dst_pfn = 0;
    struct page *dst_page;
    NV_STATUS status = NV_OK;
    int ret;

    ret = migrate_device_range(&src_pfn, pfn, 1);
    if (ret)
        return errno_to_nv_status(ret);

    if (src_pfn & MIGRATE_PFN_MIGRATE) {
        // All the code for copying a vidmem page to sysmem relies on
        // having a va_block. However certain combinations of mremap()
        // and fork() can result in device-private pages being mapped
        // in a child process without a va_block.
        //
        // We don't expect the above to be a common occurrence, so for
        // now we allocate a fresh zero page when evicting without a
        // va_block. However this results in child processes losing
        // data so make sure we warn about it. Ideally we would just
        // not migrate and SIGBUS the child if it tries to access the
        // page. However that would prevent unloading of the driver so
        // we're stuck with this until we fix the problem.
        // TODO: Bug 3902536: add code to migrate GPU memory without having a
        // va_block.
        WARN_ON(1);
        dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
        if (!dst_page) {
            status = NV_ERR_NO_MEMORY;
            goto out;
        }

        lock_page(dst_page);
        dst_pfn = migrate_pfn(page_to_pfn(dst_page));

        migrate_device_pages(&src_pfn, &dst_pfn, 1);
    }

out:
    migrate_device_finalize(&src_pfn, &dst_pfn, 1);

    return status;
}

// The routines below are all for UVM-HMM tests.

NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
                                        struct mm_struct *mm,
                                        NvU64 lookup_address,
                                        NvU64 *startp,
                                        NvU64 *endp,
                                        UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
{
    struct vm_area_struct *vma;
    NvU64 start;
    NvU64 end;

    if (!uvm_hmm_is_enabled(va_space) || !mm)
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // The VMA might have changed while mmap_lock was not held, so check it.
    vma = find_vma(mm, lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    // Since managed VA ranges don't cover more than one VMA, return only the
    // intersecting range of the VA block and VMA.
    start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
    end = start + UVM_VA_BLOCK_SIZE - 1;
    if (start < vma->vm_start)
        start = vma->vm_start;
    if (end > vma->vm_end - 1)
        end = vma->vm_end - 1;

    *startp = start;
    *endp = end;

    if (params) {
        uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
        params->resident_physical_size[0] = PAGE_SIZE;
        params->resident_on_count = 1;

        uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
        params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
                                  UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
        params->page_size[0] = PAGE_SIZE;
        params->mapped_on_count = 1;

        uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
        params->populated_on_count = 1;
    }

    return NV_OK;
}

NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
                                                 struct mm_struct *mm,
                                                 NvU64 lookup_address,
                                                 bool populate)
{
    uvm_va_space_t *va_space = va_block->hmm.va_space;
    struct vm_area_struct *vma;
    struct hmm_range range;
    uvm_va_block_region_t region;
    unsigned long pfn;
    NvU64 end;
    int ret;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space) || !mm)
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // The VMA might have changed while mmap_lock was not held, so check it.
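    // Added note: find_vma() is only safe because mmap_lock is held (asserted
    // above); uvm_hmm_vma_is_valid() then rejects lookups that no longer fall
    // inside a VMA that HMM can service before the snapshot below is taken.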
    vma = find_vma(mm, lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    end = lookup_address + PAGE_SIZE;
    region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);

    range.notifier = &va_block->hmm.notifier;
    range.start = lookup_address;
    range.end = end;
    range.hmm_pfns = &pfn;
    range.default_flags = 0;
    range.pfn_flags_mask = 0;
    range.dev_private_owner = &g_uvm_global;

    if (populate) {
        range.default_flags = HMM_PFN_REQ_FAULT;
        if (vma->vm_flags & VM_WRITE)
            range.default_flags |= HMM_PFN_REQ_WRITE;
    }

    uvm_hmm_migrate_begin_wait(va_block);

    while (true) {
        range.notifier_seq = mmu_interval_read_begin(range.notifier);
        ret = hmm_range_fault(&range);
        if (ret == -EBUSY)
            continue;
        if (ret) {
            uvm_hmm_migrate_finish(va_block);
            return errno_to_nv_status(ret);
        }

        uvm_mutex_lock(&va_block->lock);

        if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
            break;

        uvm_mutex_unlock(&va_block->lock);
    }

    // Update the va_block CPU state based on the snapshot.
    // Note that we have to adjust the pfns address since it will be indexed
    // by region.first.
    status = populate_region(va_block, &pfn - region.first, region, NULL);

    uvm_mutex_unlock(&va_block->lock);
    uvm_hmm_migrate_finish(va_block);

    return status;
}

NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);

    return NV_OK;
}

NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
                                struct mm_struct *mm,
                                UVM_TEST_VA_RANGE_INFO_PARAMS *params)
{
    uvm_range_tree_node_t *tree_node;
    const uvm_va_policy_node_t *node;
    struct vm_area_struct *vma;
    uvm_va_block_t *va_block;

    if (!mm || !uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
    params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
    params->va_range_start = 0;
    params->va_range_end = ULONG_MAX;
    params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
    memset(&params->preferred_location, 0, sizeof(params->preferred_location));
    params->accessed_by_count = 0;
    params->managed.vma_start = 0;
    params->managed.vma_end = 0;
    params->managed.is_zombie = NV_FALSE;
    params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);

    vma = find_vma(mm, params->lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    params->va_range_start = vma->vm_start;
    params->va_range_end = vma->vm_end - 1;
    params->managed.vma_start = vma->vm_start;
    params->managed.vma_end = vma->vm_end - 1;

    uvm_mutex_lock(&va_space->hmm.blocks_lock);
    tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
    if (!tree_node) {
        UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
                                               &params->va_range_start, &params->va_range_end) == NV_OK);
        uvm_mutex_unlock(&va_space->hmm.blocks_lock);
        return NV_OK;
    }

    uvm_mutex_unlock(&va_space->hmm.blocks_lock);
    va_block = hmm_va_block_from_node(tree_node);
    uvm_mutex_lock(&va_block->lock);

    params->va_range_start = va_block->start;
    params->va_range_end = va_block->end;

    node = uvm_va_policy_node_find(va_block, params->lookup_address);
    if (node) {
        uvm_processor_id_t processor_id;

        if (params->va_range_start < node->node.start)
            params->va_range_start = node->node.start;
        if (params->va_range_end > node->node.end)
            params->va_range_end = node->node.end;

        params->read_duplication = node->policy.read_duplication;

        if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
            uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);

        for_each_id_in_mask(processor_id, &node->policy.accessed_by)
            uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
    }
    else {
        uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
                                    &params->va_range_start, &params->va_range_end);
    }

    uvm_mutex_unlock(&va_block->lock);

    return NV_OK;
}

// TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
// for VMAs other than anonymous private memory.
bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
                             uvm_va_block_context_t *va_block_context)
{
    struct vm_area_struct *vma = va_block_context->hmm.vma;

    uvm_assert_mutex_locked(&va_block->lock);

    if (!uvm_va_block_is_hmm(va_block))
        return false;

    UVM_ASSERT(vma);
    UVM_ASSERT(va_block_context->mm == vma->vm_mm);
    uvm_assert_mmap_lock_locked(va_block_context->mm);

    // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
    if (va_block_context->hmm.swap_cached)
        return true;

    // migrate_vma_setup() can't migrate VM_SPECIAL so we have to force GPU
    // remote mapping.
    // TODO: Bug 3660968: add support for file-backed migrations.
    // TODO: Bug 3368756: add support for transparent huge page migrations.
    return !vma_is_anonymous(vma) ||
           (vma->vm_flags & VM_SPECIAL) ||
           vma_is_dax(vma) ||
           is_vm_hugetlb_page(vma);
}

#endif // UVM_IS_CONFIG_HMM()