1 /******************************************************************************* 2 Copyright (c) 2016-2023 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_hmm.h" 25 26 // Support for HMM ( https://docs.kernel.org/mm/hmm.html ): 27 28 #ifdef NVCPU_X86_64 29 static bool uvm_disable_hmm = false; 30 MODULE_PARM_DESC(uvm_disable_hmm, 31 "Force-disable HMM functionality in the UVM driver. " 32 "Default: false (HMM is enabled if possible). " 33 "However, even with uvm_disable_hmm=false, HMM will not be " 34 "enabled if it is not supported in this driver build " 35 "configuration, or if ATS settings conflict with HMM."); 36 #else 37 // So far, we've only tested HMM on x86_64, so disable it by default everywhere 38 // else. 39 static bool uvm_disable_hmm = true; 40 MODULE_PARM_DESC(uvm_disable_hmm, 41 "Force-disable HMM functionality in the UVM driver. " 42 "Default: true (HMM is not enabled on this CPU architecture). "
43 "However, even with uvm_disable_hmm=false, HMM will not be " 44 "enabled if it is not supported in this driver build " 45 "configuration, or if ATS settings conflict with HMM."); 46 #endif 47 48 module_param(uvm_disable_hmm, bool, 0444); 49 50 #if UVM_IS_CONFIG_HMM() 51 52 #include <linux/hmm.h> 53 #include <linux/rmap.h> 54 #include <linux/migrate.h> 55 #include <linux/userfaultfd_k.h> 56 #include <linux/memremap.h> 57 #include <linux/wait.h> 58 59 #include "uvm_common.h" 60 #include "uvm_gpu.h" 61 #include "uvm_pmm_gpu.h" 62 #include "uvm_hal_types.h" 63 #include "uvm_push.h" 64 #include "uvm_hal.h" 65 #include "uvm_va_block_types.h" 66 #include "uvm_va_space_mm.h" 67 #include "uvm_va_space.h" 68 #include "uvm_va_range.h" 69 #include "uvm_range_tree.h" 70 #include "uvm_pmm_sysmem.h" 71 #include "uvm_lock.h" 72 #include "uvm_api.h" 73 #include "uvm_va_policy.h" 74 #include "uvm_tools.h" 75 76 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block, 77 uvm_page_index_t page_index, 78 struct page *page); 79 80 typedef struct 81 { 82 uvm_processor_id_t processor_id; 83 uvm_processor_id_t new_residency; 84 uvm_va_block_t *va_block; 85 uvm_va_block_retry_t *va_block_retry; 86 uvm_service_block_context_t *service_context; 87 uvm_page_mask_t page_mask; 88 uvm_page_mask_t same_devmem_page_mask; 89 } uvm_hmm_gpu_fault_event_t; 90 91 typedef struct 92 { 93 uvm_va_block_t *va_block; 94 uvm_va_block_retry_t *va_block_retry; 95 uvm_va_block_context_t *va_block_context; 96 uvm_va_block_region_t region; 97 uvm_processor_id_t dest_id; 98 uvm_make_resident_cause_t cause; 99 uvm_page_mask_t page_mask; 100 uvm_page_mask_t same_devmem_page_mask; 101 } uvm_hmm_migrate_event_t; 102 103 typedef struct 104 { 105 uvm_processor_id_t processor_id; 106 uvm_va_block_t *va_block; 107 uvm_va_block_retry_t *va_block_retry; 108 uvm_service_block_context_t *service_context; 109 uvm_page_mask_t page_mask; 110 uvm_page_mask_t same_devmem_page_mask; 111 } uvm_hmm_devmem_fault_context_t; 112 113 bool uvm_hmm_is_enabled_system_wide(void) 114 { 115 if (uvm_disable_hmm) 116 return false; 117 118 if (g_uvm_global.ats.enabled) 119 return false; 120 121 // Confidential Computing and HMM impose mutually exclusive constraints. In 122 // Confidential Computing the GPU can only access pages resident in vidmem, 123 // but in HMM pages may be required to be resident in sysmem: file backed 124 // VMAs, huge pages, etc. 125 if (g_uvm_global.conf_computing_enabled) 126 return false; 127 128 return uvm_va_space_mm_enabled_system(); 129 } 130 131 bool uvm_hmm_is_enabled(uvm_va_space_t *va_space) 132 { 133 return uvm_hmm_is_enabled_system_wide() && 134 uvm_va_space_mm_enabled(va_space) && 135 !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM); 136 } 137 138 static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node) 139 { 140 if (!node) 141 return NULL; 142 return container_of(node, uvm_va_block_t, hmm.node); 143 } 144 145 // Copies the contents of the source device-private page to the 146 // destination CPU page. This will invalidate mappings, so cannot be 147 // called while holding any va_block locks.
148 static void hmm_copy_devmem_page(struct page *dst_page, struct page *src_page) 149 { 150 uvm_tracker_t tracker = UVM_TRACKER_INIT(); 151 uvm_gpu_phys_address_t src_addr; 152 uvm_gpu_phys_address_t dst_addr; 153 uvm_gpu_chunk_t *gpu_chunk; 154 NvU64 dma_addr; 155 uvm_push_t push; 156 NV_STATUS status = NV_OK; 157 uvm_gpu_t *gpu; 158 159 // Holding a reference on the device-private page ensures the gpu 160 // is already retained. This is because when a GPU is unregistered 161 // all device-private pages are migrated back to the CPU and freed 162 // before releasing the GPU. Therefore if we could get a reference 163 // to the page the GPU must be retained. 164 UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page)); 165 gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page); 166 gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 167 status = uvm_mmu_chunk_map(gpu_chunk); 168 if (status != NV_OK) 169 goto out_zero; 170 171 status = uvm_parent_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr); 172 if (status != NV_OK) 173 goto out_unmap_gpu; 174 175 dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr); 176 src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address); 177 status = uvm_push_begin_acquire(gpu->channel_manager, 178 UVM_CHANNEL_TYPE_GPU_TO_CPU, 179 &tracker, 180 &push, 181 "Copy for remote process fault"); 182 if (status != NV_OK) 183 goto out_unmap_cpu; 184 185 gpu->parent->ce_hal->memcopy(&push, 186 uvm_gpu_address_copy(gpu, dst_addr), 187 uvm_gpu_address_copy(gpu, src_addr), 188 PAGE_SIZE); 189 uvm_push_end(&push); 190 status = uvm_tracker_add_push_safe(&tracker, &push); 191 if (status == NV_OK) 192 uvm_tracker_wait_deinit(&tracker); 193 194 out_unmap_cpu: 195 uvm_parent_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE); 196 197 out_unmap_gpu: 198 uvm_mmu_chunk_unmap(gpu_chunk, NULL); 199 200 out_zero: 201 // We can't fail eviction because we need to free the device-private pages 202 // so the GPU can be unregistered. So the best we can do is warn on any 203 // failures and zero the uninitialised page. This could result in data loss 204 // in the application but failures are not expected. 
205 if (WARN_ON(status != NV_OK)) 206 memzero_page(dst_page, 0, PAGE_SIZE); 207 } 208 209 static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn) 210 { 211 unsigned long src_pfn = 0; 212 unsigned long dst_pfn = 0; 213 struct page *dst_page; 214 NV_STATUS status = NV_OK; 215 int ret; 216 217 ret = migrate_device_range(&src_pfn, pfn, 1); 218 if (ret) 219 return errno_to_nv_status(ret); 220 221 if (src_pfn & MIGRATE_PFN_MIGRATE) { 222 223 dst_page = alloc_page(GFP_HIGHUSER_MOVABLE); 224 if (!dst_page) { 225 status = NV_ERR_NO_MEMORY; 226 goto out; 227 } 228 229 lock_page(dst_page); 230 hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn)); 231 dst_pfn = migrate_pfn(page_to_pfn(dst_page)); 232 migrate_device_pages(&src_pfn, &dst_pfn, 1); 233 } 234 235 out: 236 migrate_device_finalize(&src_pfn, &dst_pfn, 1); 237 238 if (!(src_pfn & MIGRATE_PFN_MIGRATE)) 239 status = NV_ERR_BUSY_RETRY; 240 241 return status; 242 } 243 244 void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space) 245 { 246 uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm; 247 248 if (!uvm_hmm_is_enabled(va_space)) 249 return; 250 251 uvm_range_tree_init(&hmm_va_space->blocks); 252 uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF); 253 254 return; 255 } 256 257 void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space) 258 { 259 uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm; 260 uvm_range_tree_node_t *node, *next; 261 uvm_va_block_t *va_block; 262 263 if (!uvm_hmm_is_enabled(va_space)) 264 return; 265 266 uvm_assert_rwsem_locked_write(&va_space->lock); 267 268 // The blocks_lock is not needed when the va_space lock is held for write. 269 uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) { 270 va_block = hmm_va_block_from_node(node); 271 uvm_range_tree_remove(&hmm_va_space->blocks, node); 272 mmu_interval_notifier_remove(&va_block->hmm.notifier); 273 uvm_va_block_kill(va_block); 274 } 275 } 276 277 static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block, 278 uvm_gpu_t *gpu, 279 struct mm_struct *mm) 280 { 281 uvm_va_policy_node_t *node; 282 283 uvm_mutex_lock(&va_block->lock); 284 285 // Reset preferred location and accessed-by of policy nodes if needed. 286 uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) { 287 if (uvm_id_equal(node->policy.preferred_location, gpu->id)) 288 node->policy.preferred_location = UVM_ID_INVALID; 289 290 uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id); 291 } 292 293 // Migrate and free any remaining resident allocations on this GPU. 294 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm); 295 296 uvm_mutex_unlock(&va_block->lock); 297 } 298 299 void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm) 300 { 301 uvm_range_tree_node_t *node; 302 uvm_va_block_t *va_block; 303 struct range range = gpu->pmm.devmem.pagemap.range; 304 unsigned long pfn; 305 bool retry; 306 307 if (!uvm_hmm_is_enabled(va_space)) 308 return; 309 310 if (mm) 311 uvm_assert_mmap_lock_locked(mm); 312 uvm_assert_rwsem_locked_write(&va_space->lock); 313 314 // There could be pages with page->zone_device_data pointing to the va_space 315 // which may be about to be freed. Migrate those back to the CPU so we don't 316 // fault on them. Normally infinite retries are bad, but we don't have any 317 // option here. Device-private pages can't be pinned so migration should 318 // eventually succeed. Even if we did eventually bail out of the loop we'd 319 // just stall in memunmap_pages() anyway. 
320 do { 321 retry = false; 322 323 for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) { 324 struct page *page = pfn_to_page(pfn); 325 326 UVM_ASSERT(is_device_private_page(page)); 327 328 // This check is racy because nothing stops the page being freed and 329 // even reused. That doesn't matter though - worst case the 330 // migration fails, we retry and find the va_space doesn't match. 331 if (page->zone_device_data == va_space) 332 if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK) 333 retry = true; 334 } 335 } while (retry); 336 337 uvm_range_tree_for_each(node, &va_space->hmm.blocks) { 338 va_block = hmm_va_block_from_node(node); 339 340 hmm_va_block_unregister_gpu(va_block, gpu, mm); 341 } 342 } 343 344 static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block, 345 uvm_gpu_va_space_t *gpu_va_space, 346 uvm_va_block_context_t *va_block_context) 347 { 348 uvm_mutex_lock(&va_block->lock); 349 350 uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context); 351 352 uvm_mutex_unlock(&va_block->lock); 353 354 // TODO: Bug 3660922: Need to handle read duplication at some point. 355 // See range_remove_gpu_va_space_managed(). 356 } 357 358 void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space, 359 uvm_gpu_va_space_t *gpu_va_space, 360 struct mm_struct *mm) 361 { 362 uvm_va_block_context_t *va_block_context; 363 uvm_range_tree_node_t *node, *next; 364 uvm_va_block_t *va_block; 365 366 if (!uvm_hmm_is_enabled(va_space)) 367 return; 368 369 if (mm) 370 uvm_assert_mmap_lock_locked(mm); 371 uvm_assert_rwsem_locked_write(&va_space->lock); 372 373 va_block_context = uvm_va_space_block_context(va_space, mm); 374 375 uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) { 376 va_block = hmm_va_block_from_node(node); 377 378 hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context); 379 } 380 } 381 382 static bool hmm_invalidate(uvm_va_block_t *va_block, 383 const struct mmu_notifier_range *range, 384 unsigned long cur_seq) 385 { 386 uvm_thread_context_t *uvm_context = uvm_thread_context(); 387 struct mmu_interval_notifier *mni = &va_block->hmm.notifier; 388 struct mm_struct *mm = mni->mm; 389 uvm_va_block_context_t *va_block_context; 390 uvm_va_block_region_t region; 391 NvU64 start, end; 392 uvm_processor_id_t id; 393 NV_STATUS status = NV_OK; 394 395 // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release() 396 // doesn't remove the interval notifiers from the struct_mm so there will 397 // be a full range MMU_NOTIFY_UNMAP event after the release from 398 // unmap_vmas() during exit_mmap(). 399 if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE) 400 return true; 401 402 // Blockable is only set false by 403 // mmu_notifier_invalidate_range_start_nonblock() which is only called in 404 // __oom_reap_task_mm(). 405 if (!mmu_notifier_range_blockable(range)) 406 return false; 407 408 // We only ignore invalidations in this context whilst holding the 409 // va_block lock. This prevents deadlock when try_to_migrate() 410 // calls the notifier, but holding the lock prevents other threads 411 // invalidating PTEs so we can safely assume the results of 412 // migrate_vma_setup() are correct. 
413 if (uvm_context->ignore_hmm_invalidate_va_block == va_block || 414 ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) && 415 range->owner == &g_uvm_global)) 416 return true; 417 418 va_block_context = uvm_va_block_context_alloc(mm); 419 if (!va_block_context) 420 return true; 421 422 uvm_mutex_lock(&va_block->lock); 423 424 // mmu_interval_notifier_remove() is always called before marking a 425 // va_block as dead so this va_block has to be alive. 426 UVM_ASSERT(!uvm_va_block_is_dead(va_block)); 427 428 // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff] 429 // Also note that hmm_invalidate() can be called when a new va_block is not 430 // yet inserted into the va_space->hmm.blocks table while the original 431 // va_block is being split. The original va_block may have its end address 432 // updated before the mmu interval notifier is updated so this invalidate 433 // may be for a range past the va_block end address. 434 start = range->start; 435 end = (range->end == ULONG_MAX) ? range->end : range->end - 1; 436 if (start < va_block->start) 437 start = va_block->start; 438 if (end > va_block->end) 439 end = va_block->end; 440 if (start > end) 441 goto unlock; 442 443 // These will be equal if no other thread causes an invalidation 444 // whilst the va_block lock was dropped. 445 uvm_context->hmm_invalidate_seqnum++; 446 va_block->hmm.changed++; 447 448 mmu_interval_set_seq(mni, cur_seq); 449 450 region = uvm_va_block_region_from_start_end(va_block, start, end); 451 452 va_block_context->hmm.vma = NULL; 453 454 // We only need to unmap GPUs since Linux handles the CPUs. 455 for_each_gpu_id_in_mask(id, &va_block->mapped) { 456 status = uvm_va_block_unmap(va_block, 457 va_block_context, 458 id, 459 region, 460 uvm_va_block_map_mask_get(va_block, id), 461 &va_block->tracker); 462 // Note that the va_block lock can be dropped, relocked, and 463 // NV_ERR_MORE_PROCESSING_REQUIRED returned. 464 if (status != NV_OK) 465 break; 466 } 467 468 if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR) 469 uvm_va_block_munmap_region(va_block, region); 470 471 if (status == NV_OK) 472 status = uvm_tracker_wait(&va_block->tracker); 473 474 // Remove stale HMM struct page pointers to system memory. 
475 uvm_va_block_remove_cpu_chunks(va_block, region); 476 477 unlock: 478 uvm_mutex_unlock(&va_block->lock); 479 480 uvm_va_block_context_free(va_block_context); 481 482 UVM_ASSERT(status == NV_OK); 483 return true; 484 } 485 486 static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni, 487 const struct mmu_notifier_range *range, 488 unsigned long cur_seq) 489 { 490 uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier); 491 492 UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq)); 493 } 494 495 static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops = 496 { 497 .invalidate = uvm_hmm_invalidate_entry, 498 }; 499 500 NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space, 501 NvU64 addr, 502 uvm_va_block_t **va_block_ptr) 503 { 504 uvm_range_tree_node_t *node; 505 506 if (!uvm_hmm_is_enabled(va_space)) 507 return NV_ERR_INVALID_ADDRESS; 508 509 uvm_assert_rwsem_locked(&va_space->lock); 510 511 uvm_mutex_lock(&va_space->hmm.blocks_lock); 512 node = uvm_range_tree_find(&va_space->hmm.blocks, addr); 513 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 514 515 if (!node) 516 return NV_ERR_OBJECT_NOT_FOUND; 517 518 *va_block_ptr = hmm_va_block_from_node(node); 519 520 return NV_OK; 521 } 522 523 static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block) 524 { 525 uvm_thread_context_t *uvm_context = uvm_thread_context(); 526 int ret; 527 528 // It's only safe to ignore invalidations whilst doing a migration 529 // and holding the va_block lock. 530 uvm_assert_mutex_locked(&va_block->lock); 531 uvm_context->ignore_hmm_invalidate_va_block = va_block; 532 ret = migrate_vma_setup(args); 533 534 // We shouldn't be generating any more invalidations now. 535 uvm_context->ignore_hmm_invalidate_va_block = NULL; 536 return ret; 537 } 538 539 static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma, 540 unsigned long addr, 541 bool allow_unreadable_vma) 542 { 543 // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support 544 // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ 545 // but we allow those VMAs to have policy set on them. 546 // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled 547 // by uvm_hmm_must_use_sysmem() forcing residency to the CPU. 
548 return vma && 549 addr >= vma->vm_start && 550 !userfaultfd_armed(vma) && 551 !(vma->vm_flags & (VM_IO | VM_PFNMAP)) && 552 !uvm_vma_is_managed(vma) && 553 (allow_unreadable_vma || (vma->vm_flags & VM_READ)); 554 } 555 556 static void hmm_va_block_init(uvm_va_block_t *va_block, 557 uvm_va_space_t *va_space, 558 NvU64 start, 559 NvU64 end) 560 { 561 va_block->hmm.va_space = va_space; 562 va_block->hmm.node.start = start; 563 va_block->hmm.node.end = end; 564 uvm_range_tree_init(&va_block->hmm.va_policy_tree); 565 uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE); 566 } 567 568 static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space, 569 NvU64 addr, 570 bool allow_unreadable_vma, 571 struct vm_area_struct **vma_out, 572 uvm_va_block_t **va_block_ptr) 573 { 574 struct mm_struct *mm; 575 struct vm_area_struct *va_block_vma; 576 uvm_va_block_t *va_block; 577 NvU64 start, end; 578 NV_STATUS status; 579 int ret; 580 581 if (!uvm_hmm_is_enabled(va_space)) 582 return NV_ERR_INVALID_ADDRESS; 583 584 mm = va_space->va_space_mm.mm; 585 uvm_assert_mmap_lock_locked(mm); 586 uvm_assert_rwsem_locked(&va_space->lock); 587 UVM_ASSERT(PAGE_ALIGNED(addr)); 588 589 // Note that we have to allow PROT_NONE VMAs so that policies can be set. 590 va_block_vma = find_vma(mm, addr); 591 if (!uvm_hmm_vma_is_valid(va_block_vma, addr, allow_unreadable_vma)) 592 return NV_ERR_INVALID_ADDRESS; 593 594 // Since we only hold the va_space read lock, there can be multiple 595 // parallel va_block insertions. 596 uvm_mutex_lock(&va_space->hmm.blocks_lock); 597 598 va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr)); 599 if (va_block) 600 goto done; 601 602 // The va_block is always created to cover the whole aligned 603 // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or 604 // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it 605 // doesn't overlap. 606 start = UVM_VA_BLOCK_ALIGN_DOWN(addr); 607 end = start + UVM_VA_BLOCK_SIZE - 1; 608 609 // Search for existing UVM va_ranges in the start/end interval and create 610 // a maximum interval that doesn't overlap any existing UVM va_ranges. 611 // We know that 'addr' is not within a va_range or 612 // hmm_va_block_find_create() wouldn't be called. 613 status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end); 614 UVM_ASSERT(status == NV_OK); 615 616 // Search for existing HMM va_blocks in the start/end interval and create 617 // a maximum interval that doesn't overlap any existing HMM va_blocks. 618 status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end); 619 UVM_ASSERT(status == NV_OK); 620 621 // Create a HMM va_block with a NULL va_range pointer. 
622 status = uvm_va_block_create(NULL, start, end, &va_block); 623 if (status != NV_OK) 624 goto err_unlock; 625 626 hmm_va_block_init(va_block, va_space, start, end); 627 628 ret = mmu_interval_notifier_insert(&va_block->hmm.notifier, 629 mm, 630 start, 631 end - start + 1, 632 &uvm_hmm_notifier_ops); 633 if (ret) { 634 status = errno_to_nv_status(ret); 635 goto err_release; 636 } 637 638 status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node); 639 UVM_ASSERT(status == NV_OK); 640 641 done: 642 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 643 if (vma_out) 644 *vma_out = va_block_vma; 645 *va_block_ptr = va_block; 646 return NV_OK; 647 648 err_release: 649 uvm_va_block_release(va_block); 650 651 err_unlock: 652 uvm_mutex_unlock(&va_space->hmm.blocks_lock); 653 return status; 654 } 655 656 NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space, 657 NvU64 addr, 658 struct vm_area_struct **vma, 659 uvm_va_block_t **va_block_ptr) 660 { 661 return hmm_va_block_find_create(va_space, addr, false, vma, va_block_ptr); 662 } 663 664 NV_STATUS uvm_hmm_find_vma(struct mm_struct *mm, struct vm_area_struct **vma_out, NvU64 addr) 665 { 666 if (!mm) 667 return NV_ERR_INVALID_ADDRESS; 668 669 uvm_assert_mmap_lock_locked(mm); 670 671 *vma_out = find_vma(mm, addr); 672 if (!uvm_hmm_vma_is_valid(*vma_out, addr, false)) 673 return NV_ERR_INVALID_ADDRESS; 674 675 return NV_OK; 676 } 677 678 bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block, 679 struct vm_area_struct *vma, 680 uvm_va_block_region_t region) 681 { 682 uvm_assert_mutex_locked(&va_block->lock); 683 684 if (uvm_va_block_is_hmm(va_block)) { 685 UVM_ASSERT(vma); 686 UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm); 687 uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm); 688 UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region)); 689 UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region)); 690 } 691 692 return true; 693 } 694 695 NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block) 696 { 697 if (uvm_mutex_trylock(&va_block->hmm.migrate_lock)) 698 return NV_OK; 699 700 return NV_ERR_BUSY_RETRY; 701 } 702 703 void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block) 704 { 705 uvm_mutex_lock(&va_block->hmm.migrate_lock); 706 } 707 708 void uvm_hmm_migrate_finish(uvm_va_block_t *va_block) 709 { 710 uvm_mutex_unlock(&va_block->hmm.migrate_lock); 711 } 712 713 // Migrate the given range [start end] within a va_block to dest_id. 714 static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block, 715 uvm_va_block_retry_t *va_block_retry, 716 uvm_va_block_context_t *va_block_context, 717 uvm_processor_id_t dest_id, 718 NvU64 start, 719 NvU64 end, 720 uvm_migrate_mode_t mode, 721 uvm_tracker_t *out_tracker) 722 { 723 uvm_va_block_region_t region; 724 uvm_va_policy_node_t *node; 725 const uvm_va_policy_t *policy; 726 NV_STATUS status = NV_OK; 727 728 uvm_hmm_migrate_begin_wait(va_block); 729 uvm_mutex_lock(&va_block->lock); 730 731 uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) { 732 // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the 733 // va_block lock, the policy remains valid because we hold the mmap 734 // lock so munmap can't remove the policy, and the va_space lock so the 735 // policy APIs can't change the policy. 
736 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, 737 va_block_retry, 738 uvm_va_block_migrate_locked(va_block, 739 va_block_retry, 740 va_block_context, 741 region, 742 dest_id, 743 mode, 744 out_tracker)); 745 if (status != NV_OK) 746 break; 747 } 748 749 uvm_mutex_unlock(&va_block->lock); 750 uvm_hmm_migrate_finish(va_block); 751 752 return status; 753 } 754 755 NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr) 756 { 757 uvm_va_block_test_t *block_test; 758 uvm_va_block_t *va_block; 759 NV_STATUS status; 760 761 if (!uvm_hmm_is_enabled(va_space)) 762 return NV_ERR_INVALID_ADDRESS; 763 764 status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block); 765 if (status != NV_OK) 766 return status; 767 768 block_test = uvm_va_block_get_test(va_block); 769 if (block_test) 770 block_test->inject_split_error = true; 771 772 return NV_OK; 773 } 774 775 typedef struct { 776 struct mmu_interval_notifier notifier; 777 uvm_va_block_t *existing_block; 778 } hmm_split_invalidate_data_t; 779 780 static bool hmm_split_invalidate(struct mmu_interval_notifier *mni, 781 const struct mmu_notifier_range *range, 782 unsigned long cur_seq) 783 { 784 hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier); 785 786 uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space); 787 hmm_invalidate(split_data->existing_block, range, cur_seq); 788 789 return true; 790 } 791 792 static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni, 793 const struct mmu_notifier_range *range, 794 unsigned long cur_seq) 795 { 796 UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq)); 797 } 798 799 static const struct mmu_interval_notifier_ops hmm_notifier_split_ops = 800 { 801 .invalidate = hmm_split_invalidate_entry, 802 }; 803 804 // Splits existing va_block into two pieces, with new_va_block always after 805 // va_block. va_block is updated to have new_end. new_end+1 must be page- 806 // aligned. 807 // 808 // Before: [----------- existing ------------] 809 // After: [---- existing ----][---- new ----] 810 // ^new_end 811 // 812 // On error, va_block is still accessible and is left in its original 813 // functional state. 814 static NV_STATUS hmm_split_block(uvm_va_block_t *va_block, 815 NvU64 new_end, 816 uvm_va_block_t **new_block_ptr) 817 { 818 uvm_va_space_t *va_space = va_block->hmm.va_space; 819 struct mm_struct *mm = va_space->va_space_mm.mm; 820 hmm_split_invalidate_data_t split_data; 821 NvU64 delay_us; 822 uvm_va_block_t *new_va_block; 823 NV_STATUS status; 824 int ret; 825 826 uvm_assert_rwsem_locked_write(&va_space->lock); 827 828 UVM_ASSERT(new_end > va_block->start); 829 UVM_ASSERT(new_end < va_block->end); 830 UVM_ASSERT(PAGE_ALIGNED(new_end + 1)); 831 832 status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block); 833 if (status != NV_OK) 834 return status; 835 836 // Initialize the newly created HMM va_block. 837 hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end); 838 839 ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier, 840 mm, 841 new_va_block->start, 842 uvm_va_block_size(new_va_block), 843 &uvm_hmm_notifier_ops); 844 845 // Since __mmu_notifier_register() was called when the va_space was 846 // initially created, we know that mm->notifier_subscriptions is valid 847 // and mmu_interval_notifier_insert() can't return ENOMEM. 
848 // The only error return is for start + length overflowing but we already 849 // registered the same address range before so there should be no error. 850 UVM_ASSERT(!ret); 851 852 uvm_mutex_lock(&va_block->lock); 853 854 status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL); 855 if (status != NV_OK) 856 goto err; 857 858 uvm_mutex_unlock(&va_block->lock); 859 860 // The MMU interval notifier has to be removed in order to resize it. 861 // That means there would be a window of time when invalidation callbacks 862 // could be missed. To handle this case, we register a temporary notifier 863 // to cover the address range while resizing the old notifier (it is 864 // OK to have multiple notifiers for the same range, we may simply try to 865 // invalidate twice). 866 split_data.existing_block = va_block; 867 ret = mmu_interval_notifier_insert(&split_data.notifier, 868 mm, 869 va_block->start, 870 new_end - va_block->start + 1, 871 &hmm_notifier_split_ops); 872 UVM_ASSERT(!ret); 873 874 // Delay to allow hmm_sanity test to trigger an mmu_notifier during the 875 // critical window where the split invalidate callback is active. 876 delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us); 877 if (delay_us) 878 udelay(delay_us); 879 880 mmu_interval_notifier_remove(&va_block->hmm.notifier); 881 882 // Enable notifications on the old block with the smaller size. 883 ret = mmu_interval_notifier_insert(&va_block->hmm.notifier, 884 mm, 885 va_block->start, 886 uvm_va_block_size(va_block), 887 &uvm_hmm_notifier_ops); 888 UVM_ASSERT(!ret); 889 890 mmu_interval_notifier_remove(&split_data.notifier); 891 892 if (new_block_ptr) 893 *new_block_ptr = new_va_block; 894 895 return status; 896 897 err: 898 uvm_mutex_unlock(&va_block->lock); 899 mmu_interval_notifier_remove(&new_va_block->hmm.notifier); 900 uvm_va_block_release(new_va_block); 901 return status; 902 } 903 904 // Check to see if the HMM va_block would overlap the range start/end and 905 // split it so it can be removed. That breaks down to the following cases: 906 // start/end could cover all of the HMM va_block -> 907 // remove the va_block 908 // start/end could cover the left part of the HMM va_block -> 909 // remove the left part 910 // start/end could cover the right part of the HMM va_block -> 911 // remove the right part 912 // or start/end could "punch a hole" in the middle and leave the ends intact. 913 // In each case, only one HMM va_block is removed so return it in out_va_block. 914 static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block, 915 NvU64 start, 916 NvU64 end, 917 uvm_va_block_t **out_va_block) 918 { 919 uvm_va_block_context_t *va_block_context; 920 uvm_va_space_t *va_space; 921 struct mm_struct *mm; 922 struct vm_area_struct *vma; 923 uvm_va_block_region_t region; 924 NvU64 addr, from, to; 925 uvm_va_block_t *new; 926 NV_STATUS status; 927 928 if (va_block->start < start) { 929 status = hmm_split_block(va_block, start - 1, &new); 930 if (status != NV_OK) 931 return status; 932 933 // Keep the left part, the right part will be deleted. 934 va_block = new; 935 } 936 937 if (va_block->end > end) { 938 status = hmm_split_block(va_block, end, NULL); 939 if (status != NV_OK) 940 return status; 941 942 // Keep the right part, the left part will be deleted. 943 } 944 945 *out_va_block = va_block; 946 947 // Migrate any GPU data to sysmem before destroying the HMM va_block. 
948 // We do this because the new va_range might be for a UVM external 949 // allocation which could be converting an address range that was first 950 // operated on by UVM-HMM and the external allocation should see that data. 951 va_space = va_block->hmm.va_space; 952 mm = va_space->va_space_mm.mm; 953 va_block_context = uvm_va_space_block_context(va_space, mm); 954 955 for (addr = va_block->start; addr < va_block->end; addr = to + 1) { 956 vma = find_vma_intersection(mm, addr, va_block->end); 957 if (!vma) 958 break; 959 960 from = max(addr, (NvU64)vma->vm_start); 961 to = min(va_block->end, (NvU64)vma->vm_end - 1); 962 region = uvm_va_block_region_from_start_end(va_block, from, to); 963 964 if (!uvm_hmm_vma_is_valid(vma, from, false)) 965 continue; 966 967 va_block_context->hmm.vma = vma; 968 969 status = hmm_migrate_range(va_block, 970 NULL, 971 va_block_context, 972 UVM_ID_CPU, 973 from, 974 to, 975 UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP, 976 NULL); 977 if (status != NV_OK) 978 return status; 979 } 980 981 return NV_OK; 982 } 983 984 // Normally, the HMM va_block is destroyed when the va_space is destroyed 985 // (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers 986 // a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's 987 // page tables. However, it doesn't destroy the va_block because that would 988 // require calling mmu_interval_notifier_remove() which can't be called from 989 // the invalidate callback due to Linux locking constraints. If a process 990 // calls mmap()/munmap() for SAM and then creates a managed allocation, 991 // the same VMA range can be picked and there would be a UVM/HMM va_block 992 // conflict. Creating a managed allocation, external allocation, or other 993 // va_range types calls this function to remove stale HMM va_blocks or split 994 // the HMM va_block so there is no overlap. 995 NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space, 996 struct mm_struct *mm, 997 NvU64 start, 998 NvU64 end) 999 { 1000 uvm_range_tree_node_t *node, *next; 1001 uvm_va_block_t *va_block; 1002 NV_STATUS status; 1003 1004 if (!uvm_hmm_is_enabled(va_space)) 1005 return NV_OK; 1006 1007 if (mm) 1008 uvm_assert_mmap_lock_locked(mm); 1009 uvm_assert_rwsem_locked_write(&va_space->lock); 1010 1011 // Process each HMM va_block that overlaps the interval [start, end]. 1012 // Note that end is inclusive. 1013 // The blocks_lock is not needed when the va_space lock is held for write. 1014 uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) { 1015 va_block = hmm_va_block_from_node(node); 1016 1017 if (mm) { 1018 status = split_block_if_needed(va_block, start, end, &va_block); 1019 if (status != NV_OK) 1020 return status; 1021 } 1022 1023 // Note that this waits for any invalidation callbacks to complete 1024 // so uvm_hmm_invalidate() won't see a block disappear. 1025 // The va_space write lock should prevent uvm_hmm_va_block_find_create() 1026 // from adding it back.
1027 mmu_interval_notifier_remove(&va_block->hmm.notifier); 1028 uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node); 1029 uvm_va_block_kill(va_block); 1030 } 1031 1032 UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end)); 1033 1034 return NV_OK; 1035 } 1036 1037 void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block) 1038 { 1039 uvm_va_space_t *va_space = existing_va_block->hmm.va_space; 1040 1041 UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block)); 1042 uvm_assert_rwsem_locked_write(&va_space->lock); 1043 1044 uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks, 1045 &existing_va_block->hmm.node, 1046 &new_block->hmm.node); 1047 } 1048 1049 NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space, 1050 NvU64 addr, 1051 uvm_va_policy_is_split_needed_t split_needed_cb, 1052 void *data) 1053 { 1054 uvm_va_block_t *va_block; 1055 uvm_va_policy_node_t *node; 1056 NV_STATUS status; 1057 1058 uvm_assert_rwsem_locked_write(&va_space->lock); 1059 1060 // If there is no HMM va_block or the va_block doesn't span the policy 1061 // addr, there is no need to split. 1062 status = uvm_hmm_va_block_find(va_space, addr, &va_block); 1063 if (status != NV_OK || va_block->start == addr) 1064 return NV_OK; 1065 1066 uvm_mutex_lock(&va_block->lock); 1067 1068 node = uvm_va_policy_node_find(va_block, addr); 1069 if (!node) 1070 goto done; 1071 1072 // If the policy range doesn't span addr, we're done. 1073 if (addr == node->node.start) 1074 goto done; 1075 1076 if (split_needed_cb(&node->policy, data)) 1077 status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL); 1078 1079 done: 1080 uvm_mutex_unlock(&va_block->lock); 1081 return status; 1082 } 1083 1084 static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block, 1085 uvm_va_block_context_t *va_block_context, 1086 uvm_processor_id_t preferred_location, 1087 int preferred_cpu_nid, 1088 NvU64 addr, 1089 NvU64 end, 1090 uvm_tracker_t *out_tracker) 1091 { 1092 uvm_processor_mask_t set_accessed_by_processors; 1093 const uvm_va_policy_t *old_policy; 1094 uvm_va_policy_node_t *node; 1095 uvm_va_block_region_t region; 1096 uvm_processor_id_t id; 1097 NV_STATUS status, tracker_status; 1098 1099 // Note that we can't just call uvm_va_policy_set_range() for the whole 1100 // range [addr end] because we need to examine the old value of 1101 // policy->preferred_location and policy->preferred_nid before setting it. 1102 // Thus we iterate over the existing policy nodes. 1103 uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) { 1104 if (uvm_va_policy_preferred_location_equal(old_policy, preferred_location, preferred_cpu_nid)) 1105 continue; 1106 1107 // If the old preferred location is a valid processor ID, remote 1108 // mappings should be established to the new preferred location if 1109 // accessed-by is set. 1110 uvm_processor_mask_zero(&set_accessed_by_processors); 1111 1112 if (UVM_ID_IS_VALID(old_policy->preferred_location) && 1113 uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location)) 1114 uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location); 1115 1116 if (!uvm_va_policy_set_preferred_location(va_block, 1117 region, 1118 preferred_location, 1119 preferred_cpu_nid, 1120 old_policy)) 1121 return NV_ERR_NO_MEMORY; 1122 1123 // Establish new remote mappings if the old preferred location had 1124 // accessed-by set. 
1125 for_each_id_in_mask(id, &set_accessed_by_processors) { 1126 status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker); 1127 if (status != NV_OK) 1128 return status; 1129 } 1130 1131 // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock 1132 // the va_block lock, the policy remains valid because we hold the mmap 1133 // lock so munmap can't remove the policy, and the va_space lock so the 1134 // policy APIs can't change the policy. 1135 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, 1136 NULL, 1137 uvm_va_block_set_preferred_location_locked(va_block, 1138 va_block_context, 1139 region)); 1140 1141 tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker); 1142 if (status == NV_OK) 1143 status = tracker_status; 1144 1145 if (status != NV_OK) 1146 return status; 1147 } 1148 1149 return NV_OK; 1150 } 1151 1152 NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space, 1153 uvm_processor_id_t preferred_location, 1154 int preferred_cpu_nid, 1155 NvU64 base, 1156 NvU64 last_address, 1157 uvm_tracker_t *out_tracker) 1158 { 1159 uvm_va_block_context_t *va_block_context; 1160 uvm_va_block_t *va_block; 1161 NvU64 addr; 1162 NV_STATUS status = NV_OK; 1163 1164 if (!uvm_hmm_is_enabled(va_space)) 1165 return NV_ERR_INVALID_ADDRESS; 1166 1167 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1168 uvm_assert_rwsem_locked_write(&va_space->lock); 1169 UVM_ASSERT(PAGE_ALIGNED(base)); 1170 UVM_ASSERT(PAGE_ALIGNED(last_address + 1)); 1171 UVM_ASSERT(base < last_address); 1172 1173 // Update HMM preferred location policy. 1174 1175 va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm); 1176 1177 for (addr = base; addr < last_address; addr = va_block->end + 1) { 1178 NvU64 end; 1179 1180 status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block); 1181 if (status != NV_OK) 1182 break; 1183 1184 end = min(last_address, va_block->end); 1185 1186 uvm_mutex_lock(&va_block->lock); 1187 1188 status = hmm_set_preferred_location_locked(va_block, 1189 va_block_context, 1190 preferred_location, 1191 preferred_cpu_nid, 1192 addr, 1193 end, 1194 out_tracker); 1195 1196 uvm_mutex_unlock(&va_block->lock); 1197 1198 if (status != NV_OK) 1199 break; 1200 } 1201 1202 return status; 1203 } 1204 1205 static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block, 1206 uvm_va_block_context_t *va_block_context, 1207 uvm_processor_id_t processor_id, 1208 NvU64 start, 1209 NvU64 end, 1210 uvm_tracker_t *out_tracker) 1211 { 1212 uvm_va_space_t *va_space = va_block->hmm.va_space; 1213 uvm_va_policy_node_t *node; 1214 uvm_va_block_region_t region; 1215 NV_STATUS status = NV_OK; 1216 1217 uvm_for_each_va_policy_node_in(node, va_block, start, end) { 1218 // Read duplication takes precedence over SetAccessedBy. 1219 // Do not add mappings if read duplication is enabled. 
1220 if (uvm_va_policy_is_read_duplicate(&node->policy, va_space)) 1221 continue; 1222 1223 region = uvm_va_block_region_from_start_end(va_block, 1224 max(start, node->node.start), 1225 min(end, node->node.end)); 1226 1227 status = uvm_va_block_set_accessed_by_locked(va_block, 1228 va_block_context, 1229 processor_id, 1230 region, 1231 out_tracker); 1232 if (status != NV_OK) 1233 break; 1234 } 1235 1236 return status; 1237 } 1238 1239 NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space, 1240 uvm_processor_id_t processor_id, 1241 bool set_bit, 1242 NvU64 base, 1243 NvU64 last_address, 1244 uvm_tracker_t *out_tracker) 1245 { 1246 uvm_va_block_context_t *va_block_context; 1247 uvm_va_block_t *va_block; 1248 NvU64 addr; 1249 NV_STATUS status = NV_OK; 1250 1251 if (!uvm_hmm_is_enabled(va_space)) 1252 return NV_ERR_INVALID_ADDRESS; 1253 1254 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1255 uvm_assert_rwsem_locked_write(&va_space->lock); 1256 UVM_ASSERT(PAGE_ALIGNED(base)); 1257 UVM_ASSERT(PAGE_ALIGNED(last_address + 1)); 1258 UVM_ASSERT(base < last_address); 1259 1260 // Update HMM accessed by policy. 1261 1262 va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm); 1263 1264 for (addr = base; addr < last_address; addr = va_block->end + 1) { 1265 NvU64 end; 1266 1267 status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block); 1268 if (status != NV_OK) 1269 break; 1270 1271 end = min(last_address, va_block->end); 1272 1273 uvm_mutex_lock(&va_block->lock); 1274 1275 status = uvm_va_policy_set_range(va_block, 1276 addr, 1277 end, 1278 UVM_VA_POLICY_ACCESSED_BY, 1279 !set_bit, 1280 processor_id, 1281 NUMA_NO_NODE, 1282 UVM_READ_DUPLICATION_MAX); 1283 1284 if (status == NV_OK && set_bit) { 1285 status = hmm_set_accessed_by_start_end_locked(va_block, 1286 va_block_context, 1287 processor_id, 1288 addr, 1289 end, 1290 out_tracker); 1291 } 1292 1293 uvm_mutex_unlock(&va_block->lock); 1294 1295 if (status != NV_OK) 1296 break; 1297 } 1298 1299 return status; 1300 } 1301 1302 void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space, 1303 uvm_va_block_t *va_block, 1304 uvm_va_block_context_t *block_context) 1305 { 1306 uvm_tracker_t local_tracker = UVM_TRACKER_INIT(); 1307 uvm_va_policy_node_t *node; 1308 uvm_va_block_region_t region; 1309 uvm_processor_mask_t map_processors; 1310 uvm_processor_id_t id; 1311 NV_STATUS tracker_status; 1312 NV_STATUS status = NV_OK; 1313 1314 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1315 uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm); 1316 uvm_assert_rwsem_locked(&va_space->lock); 1317 1318 uvm_mutex_lock(&va_block->lock); 1319 1320 uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) { 1321 for_each_id_in_mask(id, &node->policy.accessed_by) { 1322 status = hmm_set_accessed_by_start_end_locked(va_block, 1323 block_context, 1324 id, 1325 node->node.start, 1326 node->node.end, 1327 &local_tracker); 1328 if (status != NV_OK) 1329 break; 1330 1331 if (!uvm_va_space_map_remote_on_eviction(va_space)) 1332 continue; 1333 1334 // Exclude the processors that have been already mapped due to 1335 // AccessedBy. 
1336 uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by); 1337 1338 for_each_gpu_id_in_mask(id, &map_processors) { 1339 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id); 1340 uvm_va_block_gpu_state_t *gpu_state; 1341 1342 if (!gpu->parent->access_counters_supported) 1343 continue; 1344 1345 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 1346 UVM_ASSERT(gpu_state); 1347 1348 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add 1349 // remote mappings to read-duplicated pages. Add support for it 1350 // or create a new function. 1351 status = uvm_va_block_add_mappings(va_block, 1352 block_context, 1353 id, 1354 region, 1355 &gpu_state->evicted, 1356 UvmEventMapRemoteCauseEviction); 1357 tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker); 1358 status = (status == NV_OK) ? tracker_status : status; 1359 if (status != NV_OK) { 1360 UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED); 1361 break; 1362 } 1363 } 1364 } 1365 } 1366 1367 uvm_mutex_unlock(&va_block->lock); 1368 1369 tracker_status = uvm_tracker_wait_deinit(&local_tracker); 1370 status = (status == NV_OK) ? tracker_status : status; 1371 if (status != NV_OK) { 1372 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n", 1373 va_block->start, 1374 va_block->end, 1375 nvstatusToString(status)); 1376 } 1377 } 1378 1379 const uvm_va_policy_t *uvm_hmm_find_policy_end(uvm_va_block_t *va_block, 1380 struct vm_area_struct *vma, 1381 unsigned long addr, 1382 NvU64 *endp) 1383 { 1384 const uvm_va_policy_node_t *node; 1385 const uvm_va_policy_t *policy; 1386 NvU64 end = va_block->end; 1387 1388 uvm_assert_mmap_lock_locked(vma->vm_mm); 1389 uvm_assert_mutex_locked(&va_block->lock); 1390 1391 if (end > vma->vm_end - 1) 1392 end = vma->vm_end - 1; 1393 1394 node = uvm_va_policy_node_find(va_block, addr); 1395 if (node) { 1396 policy = &node->policy; 1397 if (end > node->node.end) 1398 end = node->node.end; 1399 } 1400 else { 1401 policy = &uvm_va_policy_default; 1402 } 1403 1404 *endp = end; 1405 1406 return policy; 1407 } 1408 1409 NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block, 1410 struct vm_area_struct **vma_out, 1411 uvm_page_index_t page_index, 1412 const uvm_va_policy_t **policy, 1413 uvm_page_index_t *outerp) 1414 { 1415 unsigned long addr; 1416 NvU64 end; 1417 uvm_page_index_t outer; 1418 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block); 1419 struct mm_struct *mm = va_space->va_space_mm.mm; 1420 1421 if (!mm) 1422 return NV_ERR_INVALID_ADDRESS; 1423 1424 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1425 uvm_assert_mmap_lock_locked(mm); 1426 uvm_assert_mutex_locked(&va_block->lock); 1427 1428 addr = uvm_va_block_cpu_page_address(va_block, page_index); 1429 1430 *vma_out = vma_lookup(mm, addr); 1431 if (!*vma_out || !((*vma_out)->vm_flags & VM_READ)) 1432 return NV_ERR_INVALID_ADDRESS; 1433 1434 *policy = uvm_hmm_find_policy_end(va_block, *vma_out, addr, &end); 1435 1436 outer = uvm_va_block_cpu_page_index(va_block, end) + 1; 1437 if (*outerp > outer) 1438 *outerp = outer; 1439 1440 return NV_OK; 1441 } 1442 1443 static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block, 1444 uvm_va_block_context_t *block_context) 1445 { 1446 const uvm_va_policy_t *policy; 1447 uvm_va_policy_node_t *node; 1448 uvm_va_block_region_t region; 1449 NV_STATUS status = NV_OK; 1450 1451 uvm_mutex_lock(&va_block->lock); 1452 1453 uvm_for_each_va_policy_in(policy, va_block, va_block->start, 
va_block->end, node, region) { 1454 // Unmap may split PTEs and require a retry. Needs to be called 1455 // before the pinned pages information is destroyed. 1456 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, 1457 NULL, 1458 uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block, 1459 block_context, 1460 region)); 1461 1462 uvm_perf_thrashing_info_destroy(va_block); 1463 1464 if (status != NV_OK) 1465 break; 1466 } 1467 1468 uvm_mutex_unlock(&va_block->lock); 1469 1470 return status; 1471 } 1472 1473 NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space) 1474 { 1475 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL); 1476 uvm_range_tree_node_t *node, *next; 1477 uvm_va_block_t *va_block; 1478 NV_STATUS status = NV_OK; 1479 1480 if (!uvm_hmm_is_enabled(va_space)) 1481 return NV_OK; 1482 1483 uvm_assert_rwsem_locked_write(&va_space->lock); 1484 1485 uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) { 1486 va_block = hmm_va_block_from_node(node); 1487 1488 status = hmm_clear_thrashing_policy(va_block, block_context); 1489 if (status != NV_OK) 1490 break; 1491 } 1492 1493 return status; 1494 } 1495 1496 uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block, 1497 struct vm_area_struct *vma, 1498 const uvm_va_policy_t *policy, 1499 NvU64 address) 1500 { 1501 NvU64 start, end; 1502 1503 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1504 1505 // We need to limit the prefetch region to the VMA. 1506 start = max(va_block->start, (NvU64)vma->vm_start); 1507 end = min(va_block->end, (NvU64)vma->vm_end - 1); 1508 1509 // Also, we need to limit the prefetch region to the policy range. 1510 if (uvm_va_policy_is_default(policy)) { 1511 NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, 1512 address, 1513 &start, 1514 &end); 1515 // We already know the hole exists and covers the fault region. 
1516 UVM_ASSERT(status == NV_OK); 1517 } 1518 else { 1519 const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy); 1520 1521 start = max(start, node->node.start); 1522 end = min(end, node->node.end); 1523 } 1524 1525 return uvm_va_block_region_from_start_end(va_block, start, end); 1526 } 1527 1528 uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block, 1529 struct vm_area_struct *vma, 1530 NvU64 addr) 1531 { 1532 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1533 uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm); 1534 UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end); 1535 1536 if (!(vma->vm_flags & VM_READ)) 1537 return UVM_PROT_NONE; 1538 else if (!(vma->vm_flags & VM_WRITE)) 1539 return UVM_PROT_READ_ONLY; 1540 else 1541 return UVM_PROT_READ_WRITE_ATOMIC; 1542 } 1543 1544 static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block, 1545 uvm_page_index_t page_index, 1546 struct page *page) 1547 { 1548 uvm_cpu_chunk_t *chunk; 1549 NV_STATUS status; 1550 1551 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1552 UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index)); 1553 1554 if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index))) 1555 return NV_ERR_INVALID_ADDRESS; 1556 1557 status = uvm_cpu_chunk_alloc_hmm(page, &chunk); 1558 if (status != NV_OK) 1559 return status; 1560 1561 status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index); 1562 if (status != NV_OK) { 1563 uvm_cpu_chunk_free(chunk); 1564 return status; 1565 } 1566 1567 status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index); 1568 if (status != NV_OK) { 1569 uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index); 1570 uvm_cpu_chunk_free(chunk); 1571 } 1572 1573 return status; 1574 } 1575 1576 static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block, 1577 uvm_cpu_chunk_t *chunk, 1578 int chunk_nid, 1579 uvm_page_index_t page_index) 1580 { 1581 if (!chunk) 1582 return; 1583 1584 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 1585 !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); 1586 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE); 1587 1588 uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index); 1589 uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index); 1590 uvm_cpu_chunk_free(chunk); 1591 } 1592 1593 static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page) 1594 { 1595 uvm_cpu_chunk_t *chunk; 1596 1597 UVM_ASSERT(uvm_va_block_is_hmm(va_block)); 1598 1599 if (page) { 1600 chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index); 1601 hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index); 1602 } 1603 else { 1604 int nid; 1605 1606 for_each_possible_uvm_node(nid) { 1607 chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index); 1608 hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index); 1609 } 1610 } 1611 } 1612 1613 static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block, 1614 uvm_page_index_t page_index, 1615 struct page *page) 1616 { 1617 struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index); 1618 1619 UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index))); 1620 return old_page == page; 1621 } 1622 1623 // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the 
1624 // service_context masks to match what is being processed. Since a page 1625 // that was expected to be processed isn't migrating, we have to clear the 1626 // masks to make service_context consistent with what is actually being 1627 // handled. 1628 static void clear_service_context_masks(uvm_service_block_context_t *service_context, 1629 uvm_processor_id_t new_residency, 1630 uvm_page_index_t page_index) 1631 { 1632 uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index); 1633 1634 uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, 1635 page_index); 1636 1637 if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency)) 1638 uvm_processor_mask_clear(&service_context->resident_processors, new_residency); 1639 1640 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) 1641 uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index); 1642 1643 if (service_context->thrashing_pin_count > 0 && 1644 uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) { 1645 service_context->thrashing_pin_count--; 1646 } 1647 1648 if (service_context->read_duplicate_count > 0 && 1649 uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) { 1650 service_context->read_duplicate_count--; 1651 } 1652 } 1653 1654 static void cpu_mapping_set(uvm_va_block_t *va_block, 1655 bool is_write, 1656 uvm_page_index_t page_index) 1657 { 1658 uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU); 1659 uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index); 1660 if (is_write) 1661 uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1662 else 1663 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1664 } 1665 1666 static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index) 1667 { 1668 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index); 1669 uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index); 1670 if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) 1671 uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU); 1672 } 1673 1674 static void gpu_chunk_remove(uvm_va_block_t *va_block, 1675 uvm_page_index_t page_index, 1676 struct page *page) 1677 { 1678 uvm_va_block_gpu_state_t *gpu_state; 1679 uvm_gpu_chunk_t *gpu_chunk; 1680 uvm_gpu_id_t id; 1681 1682 id = uvm_pmm_devmem_page_to_gpu_id(page); 1683 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 1684 UVM_ASSERT(gpu_state); 1685 1686 gpu_chunk = gpu_state->chunks[page_index]; 1687 if (!gpu_chunk) { 1688 // If we didn't find a chunk it's because the page was unmapped for 1689 // mremap and no fault has established a new mapping. 
1690 UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index)); 1691 return; 1692 } 1693 1694 // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks 1695 1696 uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker); 1697 gpu_state->chunks[page_index] = NULL; 1698 } 1699 1700 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block, 1701 uvm_page_index_t page_index, 1702 struct page *page) 1703 { 1704 uvm_va_block_gpu_state_t *gpu_state; 1705 uvm_gpu_chunk_t *gpu_chunk; 1706 uvm_gpu_id_t id; 1707 NV_STATUS status; 1708 1709 id = uvm_pmm_devmem_page_to_gpu_id(page); 1710 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 1711 1712 // It's possible that this is a fresh va_block we're trying to add an 1713 // existing gpu_chunk to. This occurs for example when a GPU faults on a 1714 // virtual address that has been remapped with mremap(). 1715 if (!gpu_state) { 1716 status = uvm_va_block_gpu_state_alloc(va_block); 1717 if (status != NV_OK) 1718 return status; 1719 gpu_state = uvm_va_block_gpu_state_get(va_block, id); 1720 } 1721 1722 UVM_ASSERT(gpu_state); 1723 1724 // Note that a mremap() might be to a CPU virtual address that is nolonger 1725 // aligned with a larger GPU chunk size. We would need to allocate a new 1726 // aligned GPU chunk and copy from old to new. 1727 // TODO: Bug 3368756: add support for large GPU pages. 1728 gpu_chunk = uvm_pmm_devmem_page_to_chunk(page); 1729 UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED); 1730 UVM_ASSERT(gpu_chunk->is_referenced); 1731 UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space); 1732 1733 if (gpu_state->chunks[page_index] == gpu_chunk) 1734 return NV_OK; 1735 1736 UVM_ASSERT(!gpu_state->chunks[page_index]); 1737 1738 // In some configurations such as SR-IOV heavy, the chunk cannot be 1739 // referenced using its physical address. Create a virtual mapping. 1740 status = uvm_mmu_chunk_map(gpu_chunk); 1741 if (status != NV_OK) 1742 return status; 1743 1744 // TODO: Bug 3898467: map indirect peers. 1745 1746 uvm_processor_mask_set(&va_block->resident, id); 1747 uvm_page_mask_set(&gpu_state->resident, page_index); 1748 1749 // It is safe to modify the page index field without holding any PMM locks 1750 // because the chunk is allocated, which means that none of the other 1751 // fields in the bitmap can change. 1752 gpu_chunk->va_block = va_block; 1753 gpu_chunk->va_block_page_index = page_index; 1754 1755 gpu_state->chunks[page_index] = gpu_chunk; 1756 1757 return NV_OK; 1758 } 1759 1760 // This is called just before calling migrate_vma_finalize() in order to wait 1761 // for GPU operations to complete and update the va_block state to match which 1762 // pages migrated (or not) and therefore which pages will be released by 1763 // migrate_vma_finalize(). 1764 // 'migrated_pages' is the mask of pages that migrated, 1765 // 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns 1766 // and dst_pfns and therefore appear to migrate_vma_*() to be not migrating. 1767 // 'region' is the page index region of all migrated, non-migrated, and 1768 // same_devmem_page_mask pages. 1769 static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block, 1770 const unsigned long *src_pfns, 1771 const unsigned long *dst_pfns, 1772 uvm_va_block_region_t region, 1773 const uvm_page_mask_t *migrated_pages, 1774 const uvm_page_mask_t *same_devmem_page_mask) 1775 { 1776 uvm_page_index_t page_index; 1777 NV_STATUS status; 1778 1779 // Wait for the GPU to finish. 
migrate_vma_finalize() will release the 1780 // migrated source pages (or non migrating destination pages), so GPU 1781 // opererations must be finished by then. 1782 status = uvm_tracker_wait(&va_block->tracker); 1783 1784 for_each_va_block_page_in_region(page_index, region) { 1785 struct page *page; 1786 1787 if (uvm_page_mask_test(same_devmem_page_mask, page_index)) 1788 continue; 1789 1790 // If a page migrated, clean up the source page. 1791 // Otherwise, clean up the destination page. 1792 if (uvm_page_mask_test(migrated_pages, page_index)) 1793 page = migrate_pfn_to_page(src_pfns[page_index]); 1794 else 1795 page = migrate_pfn_to_page(dst_pfns[page_index]); 1796 1797 if (!page) 1798 continue; 1799 1800 if (is_device_private_page(page)) { 1801 gpu_chunk_remove(va_block, page_index, page); 1802 } 1803 else { 1804 // If the source page is a system memory page, 1805 // migrate_vma_finalize() will release the reference so we should 1806 // clear our pointer to it. 1807 // TODO: Bug 3660922: Need to handle read duplication at some point. 1808 hmm_va_block_cpu_page_unpopulate(va_block, page_index, page); 1809 } 1810 } 1811 1812 return status; 1813 } 1814 1815 // Update va_block state to reflect that the page isn't migrating. 1816 static void clean_up_non_migrating_page(uvm_va_block_t *va_block, 1817 const unsigned long *src_pfns, 1818 unsigned long *dst_pfns, 1819 uvm_page_index_t page_index) 1820 { 1821 struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]); 1822 1823 if (!dst_page) 1824 return; 1825 1826 // migrate_vma_finalize() will release the dst_page reference so don't keep 1827 // a pointer to it. 1828 if (is_device_private_page(dst_page)) { 1829 gpu_chunk_remove(va_block, page_index, dst_page); 1830 } 1831 else { 1832 UVM_ASSERT(page_ref_count(dst_page) == 1); 1833 1834 hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page); 1835 } 1836 1837 unlock_page(dst_page); 1838 put_page(dst_page); 1839 dst_pfns[page_index] = 0; 1840 } 1841 1842 static void clean_up_non_migrating_pages(uvm_va_block_t *va_block, 1843 const unsigned long *src_pfns, 1844 unsigned long *dst_pfns, 1845 uvm_va_block_region_t region, 1846 uvm_page_mask_t *page_mask) 1847 { 1848 uvm_page_index_t page_index; 1849 NV_STATUS status; 1850 1851 status = uvm_tracker_wait(&va_block->tracker); 1852 UVM_ASSERT(status == NV_OK); 1853 1854 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 1855 clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index); 1856 } 1857 } 1858 1859 // CPU page fault handling. 1860 1861 // Fill in the dst_pfns[page_index] entry given that there is an allocated 1862 // CPU page. 
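// 'src_page' is the page currently referenced by the CPU PTE (usually a
// device private page) and the destination is the block's already-allocated
// CPU chunk page. If the two turn out to be the same system memory page, the
// page index is only recorded in 'same_devmem_page_mask' instead of being
// migrated again.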
1863 static void lock_block_cpu_page(uvm_va_block_t *va_block, 1864 uvm_page_index_t page_index, 1865 struct page *src_page, 1866 unsigned long *dst_pfns, 1867 uvm_page_mask_t *same_devmem_page_mask) 1868 { 1869 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index); 1870 uvm_va_block_region_t chunk_region; 1871 struct page *dst_page; 1872 1873 UVM_ASSERT(chunk); 1874 UVM_ASSERT(chunk->page); 1875 1876 chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index); 1877 1878 dst_page = chunk->page + (page_index - chunk_region.first); 1879 1880 UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index))); 1881 UVM_ASSERT(!is_device_private_page(dst_page)); 1882 1883 // The source page is usually a device private page but it could be a GPU 1884 // remote mapped system memory page. It could also be a driver allocated 1885 // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned 1886 // by the driver). 1887 if (is_device_private_page(src_page)) { 1888 // Since the page isn't mirrored, it was allocated by alloc_pages() 1889 // and UVM owns the reference. We leave the reference count unchanged 1890 // and mark the page pointer as mirrored since UVM is transferring 1891 // ownership to Linux and we don't want UVM to double free the page in 1892 // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page 1893 // does not migrate, it will be freed though. 1894 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 1895 !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); 1896 UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL); 1897 UVM_ASSERT(page_ref_count(dst_page) == 1); 1898 uvm_cpu_chunk_make_hmm(chunk); 1899 } 1900 else { 1901 UVM_ASSERT(same_devmem_page_mask); 1902 UVM_ASSERT(src_page == dst_page); 1903 uvm_page_mask_set(same_devmem_page_mask, page_index); 1904 1905 // The call to migrate_vma_setup() will have inserted a migration PTE 1906 // so the CPU has no access. 1907 cpu_mapping_clear(va_block, page_index); 1908 return; 1909 } 1910 1911 lock_page(dst_page); 1912 dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); 1913 } 1914 1915 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block, 1916 uvm_gpu_t *gpu, 1917 uvm_gpu_chunk_t *gpu_chunk) 1918 { 1919 // Tell PMM to expect a callback from Linux to free the page since the 1920 // device private struct page reference count will determine when the 1921 // GPU chunk is free. 
1922 UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED); 1923 list_del_init(&gpu_chunk->list); 1924 uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block); 1925 } 1926 1927 static void fill_dst_pfn(uvm_va_block_t *va_block, 1928 uvm_gpu_t *gpu, 1929 const unsigned long *src_pfns, 1930 unsigned long *dst_pfns, 1931 uvm_page_index_t page_index, 1932 uvm_page_mask_t *same_devmem_page_mask) 1933 { 1934 unsigned long src_pfn = src_pfns[page_index]; 1935 uvm_gpu_chunk_t *gpu_chunk; 1936 unsigned long pfn; 1937 struct page *dpage; 1938 1939 gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index)); 1940 UVM_ASSERT(gpu_chunk); 1941 UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT); 1942 pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 1943 1944 // If the same GPU page is both source and destination, migrate_vma_pages() 1945 // will see the wrong "expected" reference count and not migrate it, so we 1946 // mark it as not migrating but we keep track of this so we don't confuse 1947 // it with a page that migrate_vma_pages() actually does not migrate. 1948 if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) { 1949 uvm_page_mask_set(same_devmem_page_mask, page_index); 1950 return; 1951 } 1952 1953 dpage = pfn_to_page(pfn); 1954 UVM_ASSERT(is_device_private_page(dpage)); 1955 UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global); 1956 1957 hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk); 1958 UVM_ASSERT(!page_count(dpage)); 1959 zone_device_page_init(dpage); 1960 dpage->zone_device_data = va_block->hmm.va_space; 1961 1962 dst_pfns[page_index] = migrate_pfn(pfn); 1963 } 1964 1965 static void fill_dst_pfns(uvm_va_block_t *va_block, 1966 const unsigned long *src_pfns, 1967 unsigned long *dst_pfns, 1968 uvm_va_block_region_t region, 1969 uvm_page_mask_t *page_mask, 1970 uvm_page_mask_t *same_devmem_page_mask, 1971 uvm_processor_id_t dest_id) 1972 { 1973 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id); 1974 uvm_page_index_t page_index; 1975 1976 uvm_page_mask_zero(same_devmem_page_mask); 1977 1978 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 1979 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) 1980 continue; 1981 1982 fill_dst_pfn(va_block, 1983 gpu, 1984 src_pfns, 1985 dst_pfns, 1986 page_index, 1987 same_devmem_page_mask); 1988 } 1989 } 1990 1991 static NV_STATUS alloc_page_on_cpu(uvm_va_block_t *va_block, 1992 uvm_page_index_t page_index, 1993 const unsigned long *src_pfns, 1994 unsigned long *dst_pfns, 1995 uvm_page_mask_t *same_devmem_page_mask, 1996 uvm_va_block_context_t *block_context) 1997 { 1998 NV_STATUS status; 1999 struct page *src_page; 2000 struct page *dst_page; 2001 2002 // This is the page that will be copied to system memory. 2003 src_page = migrate_pfn_to_page(src_pfns[page_index]); 2004 2005 if (src_page) { 2006 // mremap may have caused us to lose the gpu_chunk associated with 2007 // this va_block/page_index so make sure we have the correct chunk. 
2008 if (is_device_private_page(src_page)) 2009 gpu_chunk_add(va_block, page_index, src_page); 2010 2011 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2012 lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask); 2013 return NV_OK; 2014 } 2015 } 2016 2017 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2018 !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); 2019 2020 status = uvm_va_block_populate_page_cpu(va_block, page_index, block_context); 2021 if (status != NV_OK) 2022 return status; 2023 2024 // TODO: Bug 3368756: add support for transparent huge pages 2025 // Support for large CPU pages means the page_index may need fixing 2026 dst_page = migrate_pfn_to_page(block_context->hmm.dst_pfns[page_index]); 2027 2028 // Note that we don't call get_page(dst_page) since alloc_page_vma() 2029 // returns with a page reference count of one and we are passing 2030 // ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded 2031 // the page as "mirrored" so that migrate_vma_finalize() and 2032 // hmm_va_block_cpu_page_unpopulate() don't double free the page. 2033 lock_page(dst_page); 2034 dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page)); 2035 2036 return NV_OK; 2037 } 2038 2039 // Allocates pages on the CPU to handle migration due to a page fault 2040 static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block, 2041 const unsigned long *src_pfns, 2042 unsigned long *dst_pfns, 2043 uvm_va_block_region_t region, 2044 uvm_page_mask_t *page_mask, 2045 uvm_page_mask_t *same_devmem_page_mask, 2046 uvm_processor_id_t fault_processor_id, 2047 uvm_service_block_context_t *service_context) 2048 { 2049 uvm_page_index_t page_index; 2050 NV_STATUS status = NV_OK; 2051 2052 UVM_ASSERT(service_context); 2053 2054 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2055 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 2056 // Device exclusive PTEs are not selected but we still want to 2057 // process the page so record it as such. 2058 if (!UVM_ID_IS_CPU(fault_processor_id) && 2059 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) { 2060 uvm_page_mask_set(same_devmem_page_mask, page_index); 2061 continue; 2062 } 2063 2064 // We have previously found a page that is CPU resident which can't 2065 // be migrated (probably a shared mapping) so make sure we establish 2066 // a remote mapping for it. 2067 if (uvm_page_mask_test(same_devmem_page_mask, page_index)) 2068 continue; 2069 2070 goto clr_mask; 2071 } 2072 2073 status = alloc_page_on_cpu(va_block, page_index, src_pfns, dst_pfns, same_devmem_page_mask, service_context->block_context); 2074 if (status != NV_OK) { 2075 // Ignore errors if the page is only for prefetching. 2076 if (service_context && 2077 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH) 2078 goto clr_mask; 2079 break; 2080 } 2081 continue; 2082 2083 clr_mask: 2084 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2085 uvm_page_mask_clear(page_mask, page_index); 2086 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 2087 } 2088 2089 if (status != NV_OK) 2090 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2091 else if (uvm_page_mask_empty(page_mask)) 2092 return NV_WARN_MORE_PROCESSING_REQUIRED; 2093 2094 return status; 2095 } 2096 2097 // Allocates pages on the CPU for explicit migration calls. 
2098 static NV_STATUS migrate_alloc_on_cpu(uvm_va_block_t *va_block, 2099 const unsigned long *src_pfns, 2100 unsigned long *dst_pfns, 2101 uvm_va_block_region_t region, 2102 uvm_page_mask_t *page_mask, 2103 uvm_page_mask_t *same_devmem_page_mask, 2104 uvm_va_block_context_t *block_context) 2105 { 2106 uvm_page_index_t page_index; 2107 NV_STATUS status = NV_OK; 2108 2109 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2110 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 2111 // We have previously found a page that is CPU resident which can't 2112 // be migrated (probably a shared mapping) so make sure we establish 2113 // a remote mapping for it. 2114 if (uvm_page_mask_test(same_devmem_page_mask, page_index)) 2115 continue; 2116 2117 uvm_page_mask_clear(page_mask, page_index); 2118 continue; 2119 } 2120 2121 status = alloc_page_on_cpu(va_block, page_index, src_pfns, dst_pfns, same_devmem_page_mask, block_context); 2122 } 2123 2124 if (status != NV_OK) 2125 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2126 else if (uvm_page_mask_empty(page_mask)) 2127 return NV_WARN_MORE_PROCESSING_REQUIRED; 2128 2129 return status; 2130 } 2131 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2132 { 2133 uvm_processor_id_t processor_id; 2134 uvm_service_block_context_t *service_context; 2135 uvm_va_block_retry_t *va_block_retry; 2136 const unsigned long *src_pfns; 2137 unsigned long *dst_pfns; 2138 uvm_page_mask_t *page_mask; 2139 uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask; 2140 uvm_va_block_t *va_block; 2141 NV_STATUS status = NV_OK; 2142 2143 processor_id = devmem_fault_context->processor_id; 2144 service_context = devmem_fault_context->service_context; 2145 va_block_retry = devmem_fault_context->va_block_retry; 2146 va_block = devmem_fault_context->va_block; 2147 src_pfns = service_context->block_context->hmm.src_pfns; 2148 dst_pfns = service_context->block_context->hmm.dst_pfns; 2149 2150 // Build the migration page mask. 2151 // Note that thrashing pinned pages and prefetch pages are already 2152 // accounted for in service_context->per_processor_masks. 2153 page_mask = &devmem_fault_context->page_mask; 2154 uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency); 2155 2156 status = fault_alloc_on_cpu(va_block, 2157 src_pfns, 2158 dst_pfns, 2159 service_context->region, 2160 page_mask, 2161 same_devmem_page_mask, 2162 processor_id, 2163 service_context); 2164 if (status != NV_OK) 2165 return status; 2166 2167 // Do the copy but don't update the residency or mapping for the new 2168 // location yet. 
2169 return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2170 } 2171 2172 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context) 2173 { 2174 uvm_processor_id_t processor_id; 2175 uvm_service_block_context_t *service_context; 2176 uvm_perf_prefetch_hint_t *prefetch_hint; 2177 uvm_va_block_retry_t *va_block_retry; 2178 const unsigned long *src_pfns; 2179 unsigned long *dst_pfns; 2180 uvm_page_mask_t *page_mask; 2181 uvm_va_block_t *va_block; 2182 uvm_va_block_region_t region; 2183 uvm_page_index_t page_index; 2184 NV_STATUS status, tracker_status; 2185 2186 processor_id = devmem_fault_context->processor_id; 2187 service_context = devmem_fault_context->service_context; 2188 prefetch_hint = &service_context->prefetch_hint; 2189 va_block = devmem_fault_context->va_block; 2190 va_block_retry = devmem_fault_context->va_block_retry; 2191 src_pfns = service_context->block_context->hmm.src_pfns; 2192 dst_pfns = service_context->block_context->hmm.dst_pfns; 2193 region = service_context->region; 2194 2195 page_mask = &devmem_fault_context->page_mask; 2196 2197 // There are a number of reasons why HMM will mark a page as not migrating 2198 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2199 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2200 if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE) 2201 continue; 2202 2203 // If a page isn't migrating and only the GPU page table is being 2204 // updated, continue to process it normally. 2205 if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index)) 2206 continue; 2207 2208 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2209 uvm_page_mask_clear(page_mask, page_index); 2210 clear_service_context_masks(service_context, UVM_ID_CPU, page_index); 2211 } 2212 2213 if (uvm_page_mask_empty(page_mask)) 2214 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2215 else 2216 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2217 2218 tracker_status = sync_page_and_chunk_state(va_block, 2219 src_pfns, 2220 dst_pfns, 2221 region, 2222 page_mask, 2223 &devmem_fault_context->same_devmem_page_mask); 2224 2225 return status == NV_OK ? tracker_status : status; 2226 } 2227 2228 static NV_STATUS populate_region(uvm_va_block_t *va_block, 2229 unsigned long *pfns, 2230 uvm_va_block_region_t region, 2231 uvm_page_mask_t *populated_page_mask) 2232 { 2233 uvm_page_index_t page_index; 2234 NV_STATUS status; 2235 2236 // Make sure GPU state is allocated or else the GPU DMA mappings to 2237 // system memory won't be saved. 2238 status = uvm_va_block_gpu_state_alloc(va_block); 2239 if (status != NV_OK) 2240 return status; 2241 2242 for_each_va_block_page_in_region(page_index, region) { 2243 struct page *page; 2244 2245 // This case should only happen when querying CPU residency and we ask 2246 // for something not covered by a VMA. Otherwise, hmm_range_fault() 2247 // returns -EFAULT instead of setting the HMM_PFN_ERROR bit. 2248 if (pfns[page_index] & HMM_PFN_ERROR) 2249 return NV_ERR_INVALID_ADDRESS; 2250 2251 if (pfns[page_index] & HMM_PFN_VALID) { 2252 page = hmm_pfn_to_page(pfns[page_index]); 2253 } 2254 else { 2255 // The page can't be evicted since it has to be migrated to the GPU 2256 // first which would leave a device private page entry so this has 2257 // to be a pte_none(), swapped out, or similar entry. 
2258 // The page would have been allocated if populate_region() is being 2259 // called from uvm_hmm_va_block_service_locked() so this must be 2260 // for uvm_hmm_va_block_update_residency_info(). Just leave the 2261 // residency/populated information unchanged since 2262 // uvm_hmm_invalidate() should handle that if the underlying page 2263 // is invalidated. 2264 // Also note there can be an allocated page due to GPU-to-GPU 2265 // migration between non-peer or indirect peer GPUs. 2266 continue; 2267 } 2268 2269 if (is_device_private_page(page)) { 2270 // Linux can call hmm_invalidate() and we have to clear the GPU 2271 // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might 2272 // not release the device private struct page reference. Since 2273 // hmm_range_fault() did find a device private PTE, we can 2274 // re-establish the GPU chunk pointer. 2275 status = gpu_chunk_add(va_block, page_index, page); 2276 if (status != NV_OK) 2277 return status; 2278 continue; 2279 } 2280 2281 // If a CPU chunk is already allocated, check to see it matches what 2282 // hmm_range_fault() found. 2283 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2284 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2285 } 2286 else { 2287 status = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2288 if (status != NV_OK) 2289 return status; 2290 2291 // Record that we populated this page. hmm_block_cpu_fault_locked() 2292 // uses this to ensure pages that don't migrate get remote mapped. 2293 if (populated_page_mask) 2294 uvm_page_mask_set(populated_page_mask, page_index); 2295 } 2296 2297 // Since we have a stable snapshot of the CPU pages, we can 2298 // update the residency and protection information. 2299 uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index); 2300 2301 cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index); 2302 } 2303 2304 return NV_OK; 2305 } 2306 2307 static void hmm_range_fault_begin(uvm_va_block_t *va_block) 2308 { 2309 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2310 2311 uvm_assert_mutex_locked(&va_block->lock); 2312 uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed; 2313 } 2314 2315 static bool hmm_range_fault_retry(uvm_va_block_t *va_block) 2316 { 2317 uvm_thread_context_t *uvm_context = uvm_thread_context(); 2318 2319 uvm_assert_mutex_locked(&va_block->lock); 2320 return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed; 2321 } 2322 2323 // Make the region be resident on the CPU by calling hmm_range_fault() to fault 2324 // in CPU pages. 
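// Returns NV_WARN_MORE_PROCESSING_REQUIRED if hmm_range_fault() reported
// -EBUSY or if an invalidation callback ran while the va_block lock was
// dropped; the caller is expected to retry with fresh state. The loop below is
// only an illustration of that status contract (in this file the retry
// actually happens further up the stack after dropping the mm, va_space and
// va_block locks):
//
//     do {
//         status = hmm_make_resident_cpu(va_block, vma, hmm_pfns, region,
//                                        NULL, NULL);
//     } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);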
2325 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block, 2326 struct vm_area_struct *vma, 2327 unsigned long *hmm_pfns, 2328 uvm_va_block_region_t region, 2329 NvU8 *access_type, 2330 uvm_page_mask_t *populated_page_mask) 2331 { 2332 uvm_page_index_t page_index; 2333 int ret; 2334 struct hmm_range range = { 2335 .notifier = &va_block->hmm.notifier, 2336 .start = uvm_va_block_region_start(va_block, region), 2337 .end = uvm_va_block_region_end(va_block, region) + 1, 2338 .hmm_pfns = hmm_pfns + region.first, 2339 .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, 2340 .dev_private_owner = &g_uvm_global, 2341 }; 2342 2343 for_each_va_block_page_in_region(page_index, region) { 2344 if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) || 2345 (vma->vm_flags & VM_WRITE)) 2346 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE; 2347 else 2348 hmm_pfns[page_index] = HMM_PFN_REQ_FAULT; 2349 } 2350 2351 hmm_range_fault_begin(va_block); 2352 2353 // Mirror the VA block to the HMM address range. 2354 // Note that we request HMM to handle page faults, which means that it will 2355 // populate and map potentially not-yet-existing pages to the VMA. 2356 // Also note that mmu_interval_read_begin() calls wait_event() for any 2357 // parallel invalidation callbacks to finish so we can't hold locks that 2358 // the invalidation callback acquires. 2359 uvm_mutex_unlock(&va_block->lock); 2360 2361 range.notifier_seq = mmu_interval_read_begin(range.notifier); 2362 ret = hmm_range_fault(&range); 2363 2364 uvm_mutex_lock(&va_block->lock); 2365 2366 if (ret) 2367 return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret); 2368 2369 if (hmm_range_fault_retry(va_block)) 2370 return NV_WARN_MORE_PROCESSING_REQUIRED; 2371 2372 return populate_region(va_block, 2373 hmm_pfns, 2374 region, 2375 populated_page_mask); 2376 } 2377 2378 // Release the reference count on any pages that were made device exclusive. 2379 static void hmm_release_atomic_pages(uvm_va_block_t *va_block, 2380 uvm_service_block_context_t *service_context) 2381 { 2382 uvm_va_block_region_t region = service_context->region; 2383 uvm_page_index_t page_index; 2384 2385 for_each_va_block_page_in_region(page_index, region) { 2386 struct page *page = service_context->block_context->hmm.pages[page_index]; 2387 2388 if (!page) 2389 continue; 2390 2391 unlock_page(page); 2392 put_page(page); 2393 } 2394 } 2395 2396 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id, 2397 uvm_va_block_t *va_block, 2398 uvm_va_block_retry_t *va_block_retry, 2399 uvm_service_block_context_t *service_context) 2400 { 2401 uvm_va_block_region_t region = service_context->region; 2402 struct page **pages = service_context->block_context->hmm.pages; 2403 int npages; 2404 uvm_page_index_t page_index; 2405 uvm_make_resident_cause_t cause; 2406 NV_STATUS status; 2407 2408 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2409 !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) { 2410 // There is an atomic GPU fault. We need to make sure no pages are 2411 // GPU resident so that make_device_exclusive_range() doesn't call 2412 // migrate_to_ram() and cause a va_space lock recursion problem. 
2413 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS) 2414 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT; 2415 else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS) 2416 cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT; 2417 else 2418 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER; 2419 2420 status = uvm_hmm_va_block_migrate_locked(va_block, 2421 va_block_retry, 2422 service_context->block_context, 2423 UVM_ID_CPU, 2424 region, 2425 cause); 2426 if (status != NV_OK) 2427 goto done; 2428 2429 // make_device_exclusive_range() will try to call migrate_to_ram() 2430 // and deadlock with ourself if the data isn't CPU resident. 2431 if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) || 2432 !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) { 2433 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2434 goto done; 2435 } 2436 } 2437 2438 // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED 2439 // mmap() files so we check for that here and report a fatal fault. 2440 // Otherwise with the current Linux 6.1 make_device_exclusive_range(), 2441 // it doesn't make the page exclusive and we end up in an endless loop. 2442 if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) { 2443 status = NV_ERR_NOT_SUPPORTED; 2444 goto done; 2445 } 2446 2447 hmm_range_fault_begin(va_block); 2448 2449 uvm_mutex_unlock(&va_block->lock); 2450 2451 npages = make_device_exclusive_range(service_context->block_context->mm, 2452 uvm_va_block_cpu_page_address(va_block, region.first), 2453 uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE, 2454 pages + region.first, 2455 &g_uvm_global); 2456 2457 uvm_mutex_lock(&va_block->lock); 2458 2459 if (npages < 0) { 2460 status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages); 2461 goto done; 2462 } 2463 2464 while ((size_t)npages < uvm_va_block_region_num_pages(region)) 2465 pages[region.first + npages++] = NULL; 2466 2467 if (hmm_range_fault_retry(va_block)) { 2468 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2469 goto release; 2470 } 2471 2472 status = NV_OK; 2473 2474 for_each_va_block_page_in_region(page_index, region) { 2475 struct page *page = pages[page_index]; 2476 2477 if (!page) { 2478 // Record that one of the pages isn't exclusive but keep converting 2479 // the others. 2480 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2481 continue; 2482 } 2483 2484 // If a CPU chunk is already allocated, check to see it matches what 2485 // make_device_exclusive_range() found. 
2486 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2487 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page)); 2488 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2489 UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); 2490 } 2491 else { 2492 NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page); 2493 2494 if (s == NV_OK) 2495 uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index); 2496 } 2497 2498 cpu_mapping_clear(va_block, page_index); 2499 } 2500 2501 if (status != NV_OK) 2502 goto release; 2503 2504 status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context); 2505 if (status != NV_OK) 2506 goto release; 2507 2508 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2509 2510 release: 2511 hmm_release_atomic_pages(va_block, service_context); 2512 2513 done: 2514 return status; 2515 } 2516 2517 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region) 2518 { 2519 uvm_page_index_t page_index; 2520 2521 for_each_va_block_page_in_region(page_index, region) { 2522 if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) 2523 return true; 2524 } 2525 2526 return false; 2527 } 2528 2529 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region) 2530 { 2531 uvm_processor_id_t gpu_id; 2532 2533 for_each_gpu_id_in_mask(gpu_id, &va_block->resident) { 2534 uvm_va_block_gpu_state_t *gpu_state; 2535 2536 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id); 2537 if (!uvm_page_mask_region_empty(&gpu_state->resident, region)) 2538 return true; 2539 } 2540 2541 return false; 2542 } 2543 2544 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id, 2545 uvm_va_block_t *va_block, 2546 uvm_va_block_retry_t *va_block_retry, 2547 uvm_service_block_context_t *service_context) 2548 { 2549 uvm_va_block_region_t region = service_context->region; 2550 struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args; 2551 NV_STATUS status; 2552 int ret; 2553 uvm_hmm_devmem_fault_context_t fault_context = { 2554 .processor_id = processor_id, 2555 .va_block = va_block, 2556 .va_block_retry = va_block_retry, 2557 .service_context = service_context, 2558 }; 2559 2560 // Normally the source page will be a device private page that is being 2561 // migrated to system memory. However, when it is a GPU fault, the source 2562 // page can be a system memory page that the GPU needs to remote map 2563 // instead. However migrate_vma_setup() won't select these types of 2564 // mappings/pages: 2565 // - device exclusive PTEs 2566 // - shared mappings 2567 // - file backed mappings 2568 // Also, if the source and destination page are the same, the page reference 2569 // count won't be the "expected" count and migrate_vma_pages() won't migrate 2570 // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and 2571 // uvm_hmm_devmem_fault_finalize_and_map() still needs to process these 2572 // pages even if src_pfn indicates they are not migrating. 
2573 uvm_page_mask_zero(&fault_context.same_devmem_page_mask); 2574 2575 if (!UVM_ID_IS_CPU(processor_id)) { 2576 if (is_atomic_fault(service_context->access_type, region)) { 2577 return hmm_block_atomic_fault_locked(processor_id, 2578 va_block, 2579 va_block_retry, 2580 service_context); 2581 } 2582 2583 status = hmm_make_resident_cpu(va_block, 2584 service_context->block_context->hmm.vma, 2585 service_context->block_context->hmm.src_pfns, 2586 region, 2587 service_context->access_type, 2588 &fault_context.same_devmem_page_mask); 2589 if (status != NV_OK) 2590 return status; 2591 2592 // If no GPU has a resident copy, we can skip the migrate_vma_*(). 2593 // This is necessary if uvm_hmm_must_use_sysmem() returned true. 2594 if (!is_gpu_resident(va_block, region)) { 2595 status = uvm_va_block_service_copy(processor_id, 2596 UVM_ID_CPU, 2597 va_block, 2598 va_block_retry, 2599 service_context); 2600 if (status != NV_OK) 2601 return status; 2602 2603 return uvm_va_block_service_finish(processor_id, va_block, service_context); 2604 } 2605 } 2606 2607 args->vma = service_context->block_context->hmm.vma; 2608 args->src = service_context->block_context->hmm.src_pfns + region.first; 2609 args->dst = service_context->block_context->hmm.dst_pfns + region.first; 2610 args->start = uvm_va_block_region_start(va_block, region); 2611 args->end = uvm_va_block_region_end(va_block, region) + 1; 2612 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 2613 args->pgmap_owner = &g_uvm_global; 2614 2615 if (UVM_ID_IS_CPU(processor_id)) { 2616 args->fault_page = service_context->cpu_fault.vmf->page; 2617 } 2618 else { 2619 args->flags |= MIGRATE_VMA_SELECT_SYSTEM; 2620 args->fault_page = NULL; 2621 } 2622 2623 ret = migrate_vma_setup_locked(args, va_block); 2624 UVM_ASSERT(!ret); 2625 2626 // The overall process here is to migrate pages from the GPU to the CPU 2627 // and possibly remote map the GPU to sysmem if accessed_by is set. 2628 // This is safe because we hold the va_block lock across the calls to 2629 // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(), 2630 // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize(). 2631 // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block 2632 // lock, a sequence number is used to tell if an invalidate() callback 2633 // occurred while not holding the lock. If the sequence number changes, 2634 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2635 // uvm_va_block_service_locked() called again. Otherwise, there were no 2636 // conflicting invalidate callbacks and our snapshots of the CPU page 2637 // tables are accurate and can be used to DMA pages and update GPU page 2638 // tables. 
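    // In short: (1) allocate and lock CPU destination pages and copy the data,
    // (2) migrate_vma_pages() switches the CPU page tables over, (3)
    // uvm_hmm_devmem_fault_finalize_and_map() updates va_block residency and
    // mappings, (4) migrate_vma_finalize() releases the source (or
    // non-migrating destination) pages.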
2639 status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context); 2640 if (status == NV_OK) { 2641 migrate_vma_pages(args); 2642 status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context); 2643 } 2644 2645 migrate_vma_finalize(args); 2646 2647 if (status == NV_WARN_NOTHING_TO_DO) 2648 status = NV_OK; 2649 2650 return status; 2651 } 2652 2653 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block, 2654 struct vm_area_struct *vma, 2655 const unsigned long *src_pfns, 2656 unsigned long *dst_pfns, 2657 uvm_va_block_region_t region, 2658 uvm_page_mask_t *page_mask, 2659 uvm_processor_id_t dest_id, 2660 uvm_service_block_context_t *service_context) 2661 { 2662 uvm_page_index_t page_index; 2663 NV_STATUS status = NV_OK; 2664 2665 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2666 struct page *src_page; 2667 2668 if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) { 2669 // HMM currently has some limitations on what pages can be migrated. 2670 // For example, no file backed pages, device private pages owned by 2671 // a different device, device exclusive or swapped out pages. 2672 goto clr_mask; 2673 } 2674 2675 // This is the page that will be copied to the destination GPU. 2676 src_page = migrate_pfn_to_page(src_pfns[page_index]); 2677 if (src_page) { 2678 if (is_device_private_page(src_page)) { 2679 status = gpu_chunk_add(va_block, page_index, src_page); 2680 if (status != NV_OK) 2681 break; 2682 continue; 2683 } 2684 2685 if (PageSwapCache(src_page)) { 2686 // TODO: Bug 4050579: Remove this when swap cached pages can be 2687 // migrated. 2688 status = NV_WARN_MISMATCHED_TARGET; 2689 break; 2690 } 2691 2692 // If the page is already allocated, it is most likely a mirrored 2693 // page. Check to be sure it matches what we have recorded. The 2694 // page shouldn't be a staging page from a GPU to GPU migration 2695 // or a remote mapped atomic sysmem page because migrate_vma_setup() 2696 // found a normal page and non-mirrored pages are only known 2697 // privately to the UVM driver. 2698 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2699 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page)); 2700 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU)); 2701 UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index)); 2702 } 2703 else { 2704 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page); 2705 if (status != NV_OK) 2706 goto clr_mask; 2707 2708 // Since there is a CPU resident page, there shouldn't be one 2709 // anywhere else. TODO: Bug 3660922: Need to handle read 2710 // duplication at some point. 2711 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2712 2713 // migrate_vma_setup() was able to isolate and lock the page; 2714 // therefore, it is CPU resident and not mapped. 2715 uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index); 2716 } 2717 2718 // The call to migrate_vma_setup() will have inserted a migration 2719 // PTE so the CPU has no access. 2720 cpu_mapping_clear(va_block, page_index); 2721 } 2722 else { 2723 // It is OK to migrate an empty anonymous page, a zero page will 2724 // be allocated on the GPU. Just be sure to free any pages 2725 // used for GPU to GPU copies. It can't be an evicted page because 2726 // migrate_vma_setup() would have found a source page. 
2727 if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) { 2728 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index)); 2729 2730 hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL); 2731 } 2732 } 2733 2734 continue; 2735 2736 clr_mask: 2737 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2738 uvm_page_mask_clear(page_mask, page_index); 2739 if (service_context) 2740 clear_service_context_masks(service_context, dest_id, page_index); 2741 } 2742 2743 if (uvm_page_mask_empty(page_mask)) 2744 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2745 2746 if (status != NV_OK) 2747 clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask); 2748 2749 return status; 2750 } 2751 2752 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma, 2753 uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2754 { 2755 uvm_processor_id_t processor_id; 2756 uvm_processor_id_t new_residency; 2757 uvm_va_block_t *va_block; 2758 uvm_va_block_retry_t *va_block_retry; 2759 uvm_service_block_context_t *service_context; 2760 uvm_perf_prefetch_hint_t *prefetch_hint; 2761 const unsigned long *src_pfns; 2762 unsigned long *dst_pfns; 2763 uvm_va_block_region_t region; 2764 uvm_page_mask_t *page_mask; 2765 NV_STATUS status; 2766 2767 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2768 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2769 va_block = uvm_hmm_gpu_fault_event->va_block; 2770 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2771 service_context = uvm_hmm_gpu_fault_event->service_context; 2772 region = service_context->region; 2773 prefetch_hint = &service_context->prefetch_hint; 2774 src_pfns = service_context->block_context->hmm.src_pfns; 2775 dst_pfns = service_context->block_context->hmm.dst_pfns; 2776 2777 // Build the migration mask. 2778 // Note that thrashing pinned pages are already accounted for in 2779 // service_context->resident_processors. 2780 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2781 uvm_page_mask_copy(page_mask, 2782 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency); 2783 2784 status = dmamap_src_sysmem_pages(va_block, 2785 vma, 2786 src_pfns, 2787 dst_pfns, 2788 region, 2789 page_mask, 2790 new_residency, 2791 service_context); 2792 if (status != NV_OK) 2793 return status; 2794 2795 // Do the alloc and copy but don't update the residency or mapping for the 2796 // new location yet. 2797 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context); 2798 if (status != NV_OK) 2799 return status; 2800 2801 // Record the destination PFNs of device private struct pages now that 2802 // uvm_va_block_service_copy() has populated the GPU destination pages. 
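    // Besides recording the PFNs, fill_dst_pfns() initializes each device
    // private destination page (taking its struct page reference) and notes in
    // same_devmem_page_mask any page whose source and destination are
    // identical.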
2803 fill_dst_pfns(va_block, 2804 src_pfns, 2805 dst_pfns, 2806 region, 2807 page_mask, 2808 &uvm_hmm_gpu_fault_event->same_devmem_page_mask, 2809 new_residency); 2810 2811 return status; 2812 } 2813 2814 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event) 2815 { 2816 uvm_processor_id_t processor_id; 2817 uvm_processor_id_t new_residency; 2818 uvm_va_block_t *va_block; 2819 uvm_va_block_retry_t *va_block_retry; 2820 uvm_service_block_context_t *service_context; 2821 const unsigned long *src_pfns; 2822 unsigned long *dst_pfns; 2823 uvm_va_block_region_t region; 2824 uvm_page_index_t page_index; 2825 uvm_page_mask_t *page_mask; 2826 NV_STATUS status, tracker_status; 2827 2828 processor_id = uvm_hmm_gpu_fault_event->processor_id; 2829 new_residency = uvm_hmm_gpu_fault_event->new_residency; 2830 va_block = uvm_hmm_gpu_fault_event->va_block; 2831 va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry; 2832 service_context = uvm_hmm_gpu_fault_event->service_context; 2833 src_pfns = service_context->block_context->hmm.src_pfns; 2834 dst_pfns = service_context->block_context->hmm.dst_pfns; 2835 region = service_context->region; 2836 page_mask = &uvm_hmm_gpu_fault_event->page_mask; 2837 2838 // There are a number of reasons why HMM will mark a page as not migrating 2839 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 2840 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 2841 unsigned long src_pfn = src_pfns[page_index]; 2842 2843 if (src_pfn & MIGRATE_PFN_MIGRATE) 2844 continue; 2845 2846 // If a device private page isn't migrating and only the GPU page table 2847 // is being updated, continue to process it normally. 2848 if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index)) 2849 continue; 2850 2851 // TODO: Bug 3900774: clean up murky mess of mask clearing. 2852 uvm_page_mask_clear(page_mask, page_index); 2853 clear_service_context_masks(service_context, new_residency, page_index); 2854 } 2855 2856 if (uvm_page_mask_empty(page_mask)) 2857 status = NV_WARN_MORE_PROCESSING_REQUIRED; 2858 else 2859 status = uvm_va_block_service_finish(processor_id, va_block, service_context); 2860 2861 tracker_status = sync_page_and_chunk_state(va_block, 2862 src_pfns, 2863 dst_pfns, 2864 region, 2865 page_mask, 2866 &uvm_hmm_gpu_fault_event->same_devmem_page_mask); 2867 2868 return status == NV_OK ? tracker_status : status; 2869 } 2870 2871 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id, 2872 uvm_processor_id_t new_residency, 2873 uvm_va_block_t *va_block, 2874 uvm_va_block_retry_t *va_block_retry, 2875 uvm_service_block_context_t *service_context) 2876 { 2877 struct mm_struct *mm = service_context->block_context->mm; 2878 struct vm_area_struct *vma = service_context->block_context->hmm.vma; 2879 uvm_va_block_region_t region = service_context->region; 2880 uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event; 2881 struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args; 2882 int ret; 2883 NV_STATUS status = NV_ERR_INVALID_ADDRESS; 2884 2885 if (!mm) 2886 return status; 2887 2888 uvm_assert_mmap_lock_locked(mm); 2889 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 2890 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 2891 uvm_assert_mutex_locked(&va_block->lock); 2892 UVM_ASSERT(vma); 2893 2894 // If the desired destination is the CPU, try to fault in CPU pages. 
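    // CPU-destination faults take a separate path because they migrate data
    // out of device private memory rather than into it.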
2895 if (UVM_ID_IS_CPU(new_residency)) 2896 return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context); 2897 2898 uvm_hmm_gpu_fault_event.processor_id = processor_id; 2899 uvm_hmm_gpu_fault_event.new_residency = new_residency; 2900 uvm_hmm_gpu_fault_event.va_block = va_block; 2901 uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry; 2902 uvm_hmm_gpu_fault_event.service_context = service_context; 2903 2904 args->vma = vma; 2905 args->src = service_context->block_context->hmm.src_pfns + region.first; 2906 args->dst = service_context->block_context->hmm.dst_pfns + region.first; 2907 args->start = uvm_va_block_region_start(va_block, region); 2908 args->end = uvm_va_block_region_end(va_block, region) + 1; 2909 args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 2910 args->pgmap_owner = &g_uvm_global; 2911 args->fault_page = NULL; 2912 2913 ret = migrate_vma_setup_locked(args, va_block); 2914 UVM_ASSERT(!ret); 2915 2916 // The overall process here is to migrate pages from the CPU or GPUs to the 2917 // faulting GPU. 2918 // This is safe because we hold the va_block lock across the calls to 2919 // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(), 2920 // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize(). 2921 // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block 2922 // lock, a sequence number is used to tell if an invalidate() callback 2923 // occurred while not holding the lock. If the sequence number changes, 2924 // all the locks need to be dropped (mm, va_space, va_block) and the whole 2925 // uvm_va_block_service_locked() called again. Otherwise, there were no 2926 // conflicting invalidate callbacks and our snapshots of the CPU page 2927 // tables are accurate and can be used to DMA pages and update GPU page 2928 // tables. TODO: Bug 3901904: there might be better ways of handling no 2929 // page being migrated. 2930 status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event); 2931 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 2932 migrate_vma_finalize(args); 2933 2934 // migrate_vma_setup() might have not been able to lock/isolate any 2935 // pages because they are swapped out or are device exclusive. 2936 // We do know that none of the pages in the region are zero pages 2937 // since migrate_vma_setup() would have reported that information. 2938 // Try to make it resident in system memory and retry the migration. 
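        // NV_WARN_MORE_PROCESSING_REQUIRED is returned below regardless of
        // whether the fault-in succeeds so that the caller retries the whole
        // service operation with fresh state.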
2939 status = hmm_make_resident_cpu(va_block, 2940 service_context->block_context->hmm.vma, 2941 service_context->block_context->hmm.src_pfns, 2942 region, 2943 service_context->access_type, 2944 NULL); 2945 return NV_WARN_MORE_PROCESSING_REQUIRED; 2946 } 2947 2948 if (status == NV_OK) { 2949 migrate_vma_pages(args); 2950 status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event); 2951 } 2952 2953 migrate_vma_finalize(args); 2954 2955 if (status == NV_WARN_NOTHING_TO_DO) 2956 status = NV_OK; 2957 2958 return status; 2959 } 2960 2961 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma, 2962 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 2963 { 2964 uvm_va_block_t *va_block; 2965 uvm_va_block_retry_t *va_block_retry; 2966 uvm_va_block_context_t *va_block_context; 2967 const unsigned long *src_pfns; 2968 unsigned long *dst_pfns; 2969 uvm_va_block_region_t region; 2970 uvm_processor_id_t dest_id; 2971 uvm_page_mask_t *page_mask; 2972 NV_STATUS status; 2973 2974 va_block = uvm_hmm_migrate_event->va_block; 2975 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 2976 va_block_context = uvm_hmm_migrate_event->va_block_context; 2977 src_pfns = va_block_context->hmm.src_pfns; 2978 dst_pfns = va_block_context->hmm.dst_pfns; 2979 region = uvm_hmm_migrate_event->region; 2980 dest_id = uvm_hmm_migrate_event->dest_id; 2981 page_mask = &uvm_hmm_migrate_event->page_mask; 2982 uvm_page_mask_init_from_region(page_mask, region, NULL); 2983 uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask); 2984 2985 uvm_assert_mutex_locked(&va_block->lock); 2986 2987 if (UVM_ID_IS_CPU(dest_id)) { 2988 status = migrate_alloc_on_cpu(va_block, 2989 src_pfns, 2990 dst_pfns, 2991 region, 2992 page_mask, 2993 &uvm_hmm_migrate_event->same_devmem_page_mask, 2994 va_block_context); 2995 } 2996 else { 2997 status = dmamap_src_sysmem_pages(va_block, 2998 vma, 2999 src_pfns, 3000 dst_pfns, 3001 region, 3002 page_mask, 3003 dest_id, 3004 NULL); 3005 } 3006 if (status != NV_OK) 3007 return status; 3008 3009 status = uvm_va_block_make_resident_copy(va_block, 3010 va_block_retry, 3011 va_block_context, 3012 dest_id, 3013 region, 3014 page_mask, 3015 NULL, 3016 uvm_hmm_migrate_event->cause); 3017 if (status != NV_OK) 3018 return status; 3019 3020 if (!UVM_ID_IS_CPU(dest_id)) { 3021 // Record the destination PFNs of device private struct pages now that 3022 // uvm_va_block_make_resident_copy() has populated the GPU destination 3023 // pages. 
3024 fill_dst_pfns(va_block, 3025 src_pfns, 3026 dst_pfns, 3027 region, 3028 page_mask, 3029 &uvm_hmm_migrate_event->same_devmem_page_mask, 3030 dest_id); 3031 } 3032 3033 return status; 3034 } 3035 3036 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event) 3037 { 3038 uvm_va_block_t *va_block; 3039 uvm_va_block_retry_t *va_block_retry; 3040 uvm_va_block_context_t *va_block_context; 3041 uvm_va_block_region_t region; 3042 uvm_processor_id_t dest_id; 3043 uvm_page_index_t page_index; 3044 uvm_page_mask_t *page_mask; 3045 const unsigned long *src_pfns; 3046 unsigned long *dst_pfns; 3047 3048 va_block = uvm_hmm_migrate_event->va_block; 3049 va_block_retry = uvm_hmm_migrate_event->va_block_retry; 3050 va_block_context = uvm_hmm_migrate_event->va_block_context; 3051 region = uvm_hmm_migrate_event->region; 3052 dest_id = uvm_hmm_migrate_event->dest_id; 3053 page_mask = &uvm_hmm_migrate_event->page_mask; 3054 src_pfns = va_block_context->hmm.src_pfns; 3055 dst_pfns = va_block_context->hmm.dst_pfns; 3056 3057 uvm_assert_mutex_locked(&va_block->lock); 3058 3059 // There are a number of reasons why HMM will mark a page as not migrating 3060 // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly. 3061 for_each_va_block_page_in_region_mask(page_index, page_mask, region) { 3062 unsigned long src_pfn = src_pfns[page_index]; 3063 3064 if (src_pfn & MIGRATE_PFN_MIGRATE) 3065 continue; 3066 3067 // If a device private page isn't migrating and only the GPU page table 3068 // is being updated, continue to process it normally. 3069 if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index)) 3070 continue; 3071 3072 uvm_page_mask_clear(page_mask, page_index); 3073 } 3074 3075 uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask); 3076 3077 return sync_page_and_chunk_state(va_block, 3078 src_pfns, 3079 dst_pfns, 3080 region, 3081 page_mask, 3082 &uvm_hmm_migrate_event->same_devmem_page_mask); 3083 } 3084 3085 // Note that migrate_vma_*() doesn't handle asynchronous migrations so the 3086 // migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect. 3087 // TODO: Bug 3900785: investigate ways to implement async migration. 
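// The caller must hold the mm's mmap_lock, the va_space lock, the va_block's
// HMM migrate lock and the va_block lock, and 'region' must lie entirely
// within va_block_context->hmm.vma (all of which is asserted below).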
3088 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block, 3089 uvm_va_block_retry_t *va_block_retry, 3090 uvm_va_block_context_t *va_block_context, 3091 uvm_processor_id_t dest_id, 3092 uvm_va_block_region_t region, 3093 uvm_make_resident_cause_t cause) 3094 { 3095 uvm_hmm_migrate_event_t uvm_hmm_migrate_event; 3096 struct vm_area_struct *vma = va_block_context->hmm.vma; 3097 NvU64 start; 3098 NvU64 end; 3099 struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args; 3100 NV_STATUS status; 3101 int ret; 3102 3103 UVM_ASSERT(vma); 3104 UVM_ASSERT(va_block_context->mm == vma->vm_mm); 3105 uvm_assert_mmap_lock_locked(va_block_context->mm); 3106 uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock); 3107 uvm_assert_mutex_locked(&va_block->hmm.migrate_lock); 3108 uvm_assert_mutex_locked(&va_block->lock); 3109 3110 start = uvm_va_block_region_start(va_block, region); 3111 end = uvm_va_block_region_end(va_block, region); 3112 UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end); 3113 3114 uvm_hmm_migrate_event.va_block = va_block; 3115 uvm_hmm_migrate_event.va_block_retry = va_block_retry; 3116 uvm_hmm_migrate_event.va_block_context = va_block_context; 3117 uvm_hmm_migrate_event.region = region; 3118 uvm_hmm_migrate_event.dest_id = dest_id; 3119 uvm_hmm_migrate_event.cause = cause; 3120 3121 args->vma = vma; 3122 args->src = va_block_context->hmm.src_pfns + region.first; 3123 args->dst = va_block_context->hmm.dst_pfns + region.first; 3124 args->start = uvm_va_block_region_start(va_block, region); 3125 args->end = uvm_va_block_region_end(va_block, region) + 1; 3126 args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE : 3127 MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM; 3128 args->pgmap_owner = &g_uvm_global; 3129 args->fault_page = NULL; 3130 3131 // Note that migrate_vma_setup() doesn't handle file backed or VM_SPECIAL 3132 // VMAs so if UvmMigrate() tries to migrate such a region, -EINVAL will 3133 // be returned and we will only try to make the pages be CPU resident. 3134 ret = migrate_vma_setup_locked(args, va_block); 3135 if (ret) 3136 return hmm_make_resident_cpu(va_block, 3137 vma, 3138 va_block_context->hmm.src_pfns, 3139 region, 3140 NULL, 3141 NULL); 3142 3143 // The overall process here is to migrate pages from the CPU or GPUs to the 3144 // destination processor. Note that block_migrate_add_mappings() handles 3145 // updating GPU mappings after the migration. 3146 // This is safe because we hold the va_block lock across the calls to 3147 // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(), 3148 // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and 3149 // block_migrate_add_mappings(). 3150 // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block 3151 // lock, a sequence number is used to tell if an invalidate() callback 3152 // occurred while not holding the lock. If the sequence number changes, 3153 // all the locks need to be dropped (mm, va_space, va_block) and the whole 3154 // uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no 3155 // conflicting invalidate callbacks and our snapshots of the CPU page 3156 // tables are accurate and can be used to DMA pages and update GPU page 3157 // tables. 
3158 status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event); 3159 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) { 3160 uvm_processor_id_t id; 3161 uvm_page_mask_t *page_mask; 3162 3163 migrate_vma_finalize(args); 3164 3165 // The CPU pages tables might contain only device private pages or 3166 // the migrate_vma_setup() might have not been able to lock/isolate 3167 // any pages because they are swapped out, or on another device. 3168 // We do know that none of the pages in the region are zero pages 3169 // since migrate_vma_setup() would have reported that information. 3170 // Collect all the pages that need to be faulted in and made CPU 3171 // resident, then do the hmm_range_fault() and retry. 3172 page_mask = &va_block_context->caller_page_mask; 3173 uvm_page_mask_init_from_region(page_mask, region, NULL); 3174 3175 for_each_id_in_mask(id, &va_block->resident) { 3176 if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE))) 3177 return NV_OK; 3178 } 3179 3180 return hmm_make_resident_cpu(va_block, 3181 vma, 3182 va_block_context->hmm.src_pfns, 3183 region, 3184 NULL, 3185 NULL); 3186 } 3187 3188 if (status == NV_OK) { 3189 migrate_vma_pages(args); 3190 status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event); 3191 } 3192 3193 migrate_vma_finalize(args); 3194 3195 if (status == NV_WARN_NOTHING_TO_DO || status == NV_WARN_MISMATCHED_TARGET) 3196 status = NV_OK; 3197 3198 return status; 3199 } 3200 3201 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space, 3202 uvm_va_block_context_t *va_block_context, 3203 NvU64 base, 3204 NvU64 length, 3205 uvm_processor_id_t dest_id, 3206 uvm_migrate_mode_t mode, 3207 uvm_tracker_t *out_tracker) 3208 { 3209 struct mm_struct *mm; 3210 uvm_va_block_t *va_block; 3211 uvm_va_block_retry_t va_block_retry; 3212 NvU64 addr, end, last_address; 3213 NV_STATUS status = NV_OK; 3214 3215 if (!uvm_hmm_is_enabled(va_space)) 3216 return NV_ERR_INVALID_ADDRESS; 3217 3218 mm = va_block_context->mm; 3219 UVM_ASSERT(mm == va_space->va_space_mm.mm); 3220 uvm_assert_mmap_lock_locked(mm); 3221 uvm_assert_rwsem_locked(&va_space->lock); 3222 3223 last_address = base + length - 1; 3224 3225 for (addr = base; addr < last_address; addr = end + 1) { 3226 struct vm_area_struct *vma; 3227 3228 status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block); 3229 if (status != NV_OK) 3230 return status; 3231 3232 end = va_block->end; 3233 if (end > last_address) 3234 end = last_address; 3235 3236 vma = va_block_context->hmm.vma; 3237 if (end > vma->vm_end - 1) 3238 end = vma->vm_end - 1; 3239 3240 status = hmm_migrate_range(va_block, 3241 &va_block_retry, 3242 va_block_context, 3243 dest_id, 3244 addr, 3245 end, 3246 mode, 3247 out_tracker); 3248 if (status != NV_OK) 3249 break; 3250 } 3251 3252 return status; 3253 } 3254 3255 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block, 3256 uvm_va_block_context_t *va_block_context, 3257 uvm_gpu_chunk_t *gpu_chunk, 3258 uvm_va_block_region_t chunk_region) 3259 { 3260 uvm_thread_context_t *uvm_context = uvm_thread_context(); 3261 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3262 uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk); 3263 unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk); 3264 uvm_page_index_t page_index = chunk_region.first; 3265 int ret; 3266 3267 uvm_assert_mutex_locked(&va_block->lock); 3268 // TODO: Bug 3368756: add support for large GPU pages. 
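    // Until large GPU page support is added, eviction is prepared one small
    // page at a time, hence the single-page region asserted below.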
3269 UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1); 3270 3271 uvm_context->ignore_hmm_invalidate_va_block = va_block; 3272 ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region)); 3273 uvm_context->ignore_hmm_invalidate_va_block = NULL; 3274 if (ret) 3275 return errno_to_nv_status(ret); 3276 3277 return NV_OK; 3278 } 3279 3280 // Note that the caller must initialize va_block_context->hmm.src_pfns by 3281 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this. 3282 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block, 3283 uvm_va_block_context_t *va_block_context, 3284 const uvm_page_mask_t *pages_to_evict, 3285 uvm_va_block_region_t region, 3286 uvm_make_resident_cause_t cause, 3287 bool *out_accessed_by_set) 3288 { 3289 NvU64 start = uvm_va_block_region_start(va_block, region); 3290 NvU64 end = uvm_va_block_region_end(va_block, region); 3291 unsigned long *src_pfns = va_block_context->hmm.src_pfns; 3292 unsigned long *dst_pfns = va_block_context->hmm.dst_pfns; 3293 uvm_hmm_migrate_event_t uvm_hmm_migrate_event = { 3294 .va_block = va_block, 3295 .va_block_retry = NULL, 3296 .va_block_context = va_block_context, 3297 .region = region, 3298 .dest_id = UVM_ID_CPU, 3299 .cause = cause, 3300 }; 3301 uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask; 3302 const uvm_va_policy_t *policy; 3303 uvm_va_policy_node_t *node; 3304 uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE); 3305 unsigned long npages; 3306 NV_STATUS status; 3307 3308 uvm_assert_mutex_locked(&va_block->lock); 3309 3310 if (out_accessed_by_set) 3311 *out_accessed_by_set = false; 3312 3313 // Note that there is no VMA available when evicting HMM pages. 3314 va_block_context->hmm.vma = NULL; 3315 3316 uvm_page_mask_copy(page_mask, pages_to_evict); 3317 3318 uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) { 3319 npages = uvm_va_block_region_num_pages(region); 3320 3321 if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0) 3322 *out_accessed_by_set = true; 3323 3324 // Pages resident on the GPU should not have a resident page in system 3325 // memory. 3326 // TODO: Bug 3660922: Need to handle read duplication at some point. 
        UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));

        status = migrate_alloc_on_cpu(va_block,
                                      src_pfns,
                                      dst_pfns,
                                      region,
                                      page_mask,
                                      NULL,
                                      va_block_context);
        if (status != NV_OK)
            goto err;

        status = uvm_va_block_make_resident_copy(va_block,
                                                 NULL,
                                                 va_block_context,
                                                 UVM_ID_CPU,
                                                 region,
                                                 page_mask,
                                                 NULL,
                                                 cause);
        if (status != NV_OK)
            goto err;

        migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);

        uvm_hmm_migrate_event.region = region;

        status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
        if (status != NV_OK)
            goto err;

        migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
    }

    return NV_OK;

err:
    migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
    return status;
}

NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
                                        uvm_va_block_context_t *va_block_context,
                                        const uvm_page_mask_t *pages_to_evict,
                                        uvm_va_block_region_t region,
                                        bool *out_accessed_by_set)
{
    return hmm_va_block_evict_chunks(va_block,
                                     va_block_context,
                                     pages_to_evict,
                                     region,
                                     UVM_MAKE_RESIDENT_CAUSE_EVICTION,
                                     out_accessed_by_set);
}

NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
                                                uvm_gpu_t *gpu,
                                                uvm_va_block_context_t *va_block_context,
                                                const uvm_page_mask_t *pages_to_evict,
                                                uvm_va_block_region_t region)
{
    unsigned long *src_pfns = va_block_context->hmm.src_pfns;
    uvm_va_block_gpu_state_t *gpu_state;
    uvm_page_index_t page_index;
    uvm_gpu_chunk_t *gpu_chunk;
    NV_STATUS status;

    uvm_assert_mutex_locked(&va_block->lock);

    gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
    UVM_ASSERT(gpu_state);
    UVM_ASSERT(gpu_state->chunks);

    // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
    memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));

    // TODO: Bug 3368756: add support for large GPU pages.
    for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
        gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
                                                  gpu,
                                                  uvm_va_block_cpu_page_address(va_block, page_index));
        status = uvm_hmm_va_block_evict_chunk_prep(va_block,
                                                   va_block_context,
                                                   gpu_chunk,
                                                   uvm_va_block_region_for_page(page_index));
        if (status != NV_OK)
            return status;
    }

    return hmm_va_block_evict_chunks(va_block,
                                     va_block_context,
                                     pages_to_evict,
                                     region,
                                     UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
                                     NULL);
}

NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
    NV_STATUS status = NV_OK;
    unsigned long src_pfn;
    unsigned long dst_pfn;
    struct migrate_vma args;
    struct page *src_page = vmf->page;
    int ret;

    args.vma = vmf->vma;
    args.src = &src_pfn;
    args.dst = &dst_pfn;
    args.start = nv_page_fault_va(vmf);
    args.end = args.start + PAGE_SIZE;
    args.pgmap_owner = &g_uvm_global;
    args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
    args.fault_page = src_page;

    // We don't call migrate_vma_setup_locked() here because we don't
    // have a va_block and don't want to ignore invalidations.
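    // migrate_vma_setup() is only expected to fail for invalid arguments
    // (e.g., a misaligned range or missing VMA), which should not happen for
    // a fault we are servicing, hence the assert below.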
    ret = migrate_vma_setup(&args);
    UVM_ASSERT(!ret);

    if (src_pfn & MIGRATE_PFN_MIGRATE) {
        struct page *dst_page;

        dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
        if (!dst_page) {
            status = NV_ERR_NO_MEMORY;
            goto out;
        }

        lock_page(dst_page);
        dst_pfn = migrate_pfn(page_to_pfn(dst_page));

        hmm_copy_devmem_page(dst_page, src_page);
    }

    migrate_vma_pages(&args);

out:
    migrate_vma_finalize(&args);

    return status;
}

// The routines below are all for UVM-HMM tests.

NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
                                        struct mm_struct *mm,
                                        NvU64 lookup_address,
                                        NvU64 *startp,
                                        NvU64 *endp,
                                        UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
{
    struct vm_area_struct *vma;
    NvU64 start;
    NvU64 end;

    if (!uvm_hmm_is_enabled(va_space) || !mm)
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // The VMA might have changed while not holding mmap_lock so check it.
    vma = find_vma(mm, lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    // Since managed VA ranges don't cover more than one VMA, return only the
    // intersecting range of the VA block and VMA.
    start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
    end = start + UVM_VA_BLOCK_SIZE - 1;
    if (start < vma->vm_start)
        start = vma->vm_start;
    if (end > vma->vm_end - 1)
        end = vma->vm_end - 1;

    *startp = start;
    *endp = end;

    if (params) {
        uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
        params->resident_physical_size[0] = PAGE_SIZE;
        params->resident_on_count = 1;

        uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
        params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
                                  UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
        params->page_size[0] = PAGE_SIZE;
        params->mapped_on_count = 1;

        uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
        params->populated_on_count = 1;
    }

    return NV_OK;
}

NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
                                                 struct mm_struct *mm,
                                                 NvU64 lookup_address,
                                                 bool populate)
{
    uvm_va_space_t *va_space = va_block->hmm.va_space;
    struct vm_area_struct *vma;
    struct hmm_range range;
    uvm_va_block_region_t region;
    unsigned long pfn;
    NvU64 end;
    int ret;
    NV_STATUS status;

    if (!uvm_hmm_is_enabled(va_space) || !mm)
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // The VMA might have changed while not holding mmap_lock so check it.
    vma = find_vma(mm, lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    end = lookup_address + PAGE_SIZE;
    region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);

    range.notifier = &va_block->hmm.notifier;
    range.start = lookup_address;
    range.end = end;
    range.hmm_pfns = &pfn;
    range.default_flags = 0;
    range.pfn_flags_mask = 0;
    range.dev_private_owner = &g_uvm_global;

    if (populate) {
        range.default_flags = HMM_PFN_REQ_FAULT;
        if (vma->vm_flags & VM_WRITE)
            range.default_flags |= HMM_PFN_REQ_WRITE;
    }

    uvm_hmm_migrate_begin_wait(va_block);

    while (true) {
        range.notifier_seq = mmu_interval_read_begin(range.notifier);
        ret = hmm_range_fault(&range);
        if (ret == -EBUSY)
            continue;
        if (ret) {
            uvm_hmm_migrate_finish(va_block);
            return errno_to_nv_status(ret);
        }

        uvm_mutex_lock(&va_block->lock);

        if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
            break;

        uvm_mutex_unlock(&va_block->lock);
    }

    // Update the va_block CPU state based on the snapshot.
    // Note that we have to adjust the pfns address since it will be indexed
    // by region.first.
    status = populate_region(va_block, &pfn - region.first, region, NULL);

    uvm_mutex_unlock(&va_block->lock);
    uvm_hmm_migrate_finish(va_block);

    return status;
}

NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);

    return NV_OK;
}

NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
                                struct mm_struct *mm,
                                UVM_TEST_VA_RANGE_INFO_PARAMS *params)
{
    uvm_range_tree_node_t *tree_node;
    const uvm_va_policy_node_t *node;
    struct vm_area_struct *vma;
    uvm_va_block_t *va_block;

    if (!mm || !uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
    params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
    params->va_range_start = 0;
    params->va_range_end = ULONG_MAX;
    params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
    memset(&params->preferred_location, 0, sizeof(params->preferred_location));
    params->preferred_cpu_nid = NUMA_NO_NODE;
    params->accessed_by_count = 0;
    params->managed.vma_start = 0;
    params->managed.vma_end = 0;
    params->managed.is_zombie = NV_FALSE;
    params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);

    vma = find_vma(mm, params->lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    params->va_range_start = vma->vm_start;
    params->va_range_end = vma->vm_end - 1;
    params->managed.vma_start = vma->vm_start;
    params->managed.vma_end = vma->vm_end - 1;

    uvm_mutex_lock(&va_space->hmm.blocks_lock);
    tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
    if (!tree_node) {
        UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
                                               &params->va_range_start, &params->va_range_end) == NV_OK);
        uvm_mutex_unlock(&va_space->hmm.blocks_lock);
        return NV_OK;
    }

    uvm_mutex_unlock(&va_space->hmm.blocks_lock);
    va_block = hmm_va_block_from_node(tree_node);
    uvm_mutex_lock(&va_block->lock);

    params->va_range_start = va_block->start;
    params->va_range_end = va_block->end;

    node = uvm_va_policy_node_find(va_block, params->lookup_address);
    if (node) {
        uvm_processor_id_t processor_id;

        if (params->va_range_start < node->node.start)
            params->va_range_start = node->node.start;
        if (params->va_range_end > node->node.end)
            params->va_range_end = node->node.end;

        params->read_duplication = node->policy.read_duplication;

        if (!UVM_ID_IS_INVALID(node->policy.preferred_location)) {
            uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
            params->preferred_cpu_nid = node->policy.preferred_nid;
        }

        for_each_id_in_mask(processor_id, &node->policy.accessed_by)
            uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
    }
    else {
        uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
                                    &params->va_range_start, &params->va_range_end);
    }

    uvm_mutex_unlock(&va_block->lock);

    return NV_OK;
}

// TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
// for VMAs other than anonymous private memory.
bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
                             struct vm_area_struct *vma)
{
    uvm_assert_mutex_locked(&va_block->lock);

    if (!uvm_va_block_is_hmm(va_block))
        return false;

    UVM_ASSERT(vma);
    UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm);
    uvm_assert_mmap_lock_locked(vma->vm_mm);

    // migrate_vma_setup() can't migrate VM_SPECIAL so we have to force GPU
    // remote mapping.
    // TODO: Bug 3660968: add support for file-backed migrations.
    // TODO: Bug 3368756: add support for transparent huge page migrations.
    return !vma_is_anonymous(vma) ||
           (vma->vm_flags & VM_SPECIAL) ||
           vma_is_dax(vma) ||
           is_vm_hugetlb_page(vma);
}

#endif // UVM_IS_CONFIG_HMM()