/*******************************************************************************
    Copyright (c) 2018-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_migrate_pageable.h"
#include "uvm_populate_pageable.h"

#ifdef UVM_MIGRATE_VMA_SUPPORTED

static struct kmem_cache *g_uvm_migrate_vma_state_cache __read_mostly;

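// GFP flags used when allocating destination pages: the standard UVM
// allocation flags, plus GFP_HIGHUSER_MOVABLE since the pages back anonymous
// user mappings, plus __GFP_THISNODE so the allocation fails instead of
// falling back to a node other than the requested dst_node_id.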
static const gfp_t g_migrate_vma_gfp_flags = NV_UVM_GFP_FLAGS | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE;

// Compute the address needed for copying_gpu to access the given page,
// resident on resident_id.
static NV_STATUS migrate_vma_page_copy_address(struct page *page,
                                               unsigned long page_index,
                                               uvm_processor_id_t resident_id,
                                               uvm_gpu_t *copying_gpu,
                                               migrate_vma_state_t *state,
                                               uvm_gpu_address_t *gpu_addr)
{
    uvm_va_space_t *va_space = state->uvm_migrate_args->va_space;
    uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id) ? NULL : uvm_va_space_get_gpu(va_space, resident_id);
    const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
                                                       resident_id);
    const bool direct_peer = owning_gpu &&
                             (owning_gpu != copying_gpu) &&
                             can_copy_from &&
                             !uvm_gpu_peer_caps(owning_gpu, copying_gpu)->is_indirect_peer;

    UVM_ASSERT(page_index < state->num_pages);

    memset(gpu_addr, 0, sizeof(*gpu_addr));

    if (owning_gpu == copying_gpu) {
        // Local vidmem address
        *gpu_addr = uvm_gpu_address_copy(owning_gpu, uvm_gpu_page_to_phys_address(owning_gpu, page));
    }
    else if (direct_peer) {
        // Direct GPU peer
        uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
        uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);

        *gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
    }
    else {
        // Sysmem/Indirect Peer
        NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);

        if (status != NV_OK)
            return status;

        state->dma.addrs_gpus[page_index] = copying_gpu;

        if (state->dma.num_pages++ == 0)
            bitmap_zero(state->dma.page_mask, state->num_pages);

        UVM_ASSERT(!test_bit(page_index, state->dma.page_mask));

        __set_bit(page_index, state->dma.page_mask);

        *gpu_addr = uvm_gpu_address_copy(copying_gpu,
                                         uvm_gpu_phys_address(UVM_APERTURE_SYS, state->dma.addrs[page_index]));
    }

    return NV_OK;
}

// Create a new push to zero pages on dst_id
static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_gpu_t *gpu,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;

    if (UVM_ID_IS_CPU(dst_id)) {
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        UVM_ASSERT(uvm_id_equal(dst_id, gpu->id));
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Zero %s from %s VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, dst_id),
                          uvm_va_space_processor_name(va_space, gpu->id),
                          start,
                          outer);
}

// Create a new push to copy pages between src_id and dst_id
static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_processor_id_t src_id,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;
    uvm_gpu_t *gpu;

    UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
                   "Unexpected copy to self, processor %s\n",
                   uvm_va_space_processor_name(va_space, src_id));

    if (UVM_ID_IS_CPU(src_id)) {
        gpu = uvm_va_space_get_gpu(va_space, dst_id);
        channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
    }
    else if (UVM_ID_IS_CPU(dst_id)) {
        gpu = uvm_va_space_get_gpu(va_space, src_id);
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        // For GPU to GPU copies, prefer to "push" the data from the source as
        // that works better
        gpu = uvm_va_space_get_gpu(va_space, src_id);

        channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
    }

    // NUMA-enabled GPUs can copy to any other NUMA node in the system even if
    // P2P access has not been explicitly enabled (ie va_space->can_copy_from
    // is not set).
    if (!gpu->mem_info.numa.enabled) {
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], src_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
    }

    if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
        uvm_gpu_t *dst_gpu = uvm_va_space_get_gpu(va_space, dst_id);
        return uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                         dst_gpu,
                                         push,
                                         "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                                         uvm_va_space_processor_name(va_space, src_id),
                                         uvm_va_space_processor_name(va_space, dst_id),
                                         start,
                                         outer);
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, src_id),
                          uvm_va_space_processor_name(va_space, dst_id),
                          start,
                          outer);
}

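// Classify each page described by the src PFN array filled in by the kernel
// and build the masks that drive the rest of the migration: pages that should
// instead be populated with get_user_pages go in populate_pages_mask, pages
// already resident on the destination go in dst_resident_pages_mask,
// unpopulated pages in writable VMAs are queued for GPU zeroing on dst_id, and
// the remaining pages are grouped by their current resident processor in
// state->processors[].page_mask for the copy phase.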
static void migrate_vma_compute_masks(struct vm_area_struct *vma, const unsigned long *src, migrate_vma_state_t *state)
{
    unsigned long i;
    const bool is_rw = vma->vm_flags & VM_WRITE;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;

    UVM_ASSERT(vma_is_anonymous(vma));

    bitmap_zero(state->populate_pages_mask, state->num_pages);
    bitmap_zero(state->allocation_failed_mask, state->num_pages);
    bitmap_zero(state->dst_resident_pages_mask, state->num_pages);

    uvm_processor_mask_zero(&state->src_processors);
    state->num_populate_anon_pages = 0;
    state->dma.num_pages = 0;

    for (i = 0; i < state->num_pages; ++i) {
        uvm_processor_id_t src_id;
        struct page *src_page = NULL;
        int src_nid;
        uvm_gpu_t *src_gpu = NULL;

        // Skip pages that cannot be migrated
        if (!(src[i] & MIGRATE_PFN_MIGRATE)) {
            // This can happen in two cases:
            // - The page is populated but can't be migrated.
            // - The page isn't populated.
            // In both cases, treat the page as failing migration and populate
            // it with get_user_pages.
            if (!(src[i] & MIGRATE_PFN_VALID))
                __set_bit(i, state->populate_pages_mask);

            continue;
        }

        src_page = migrate_pfn_to_page(src[i]);
        if (!src_page) {
            if (is_rw) {
                // Populate PROT_WRITE vmas in migrate_vma so we can use the
                // GPU's copy engines
                if (state->num_populate_anon_pages++ == 0)
                    bitmap_zero(state->processors[uvm_id_value(dst_id)].page_mask, state->num_pages);

                __set_bit(i, state->processors[uvm_id_value(dst_id)].page_mask);
            }
            else {
                // PROT_NONE vmas cannot be populated. PROT_READ anonymous vmas
                // are populated using the zero page. In order to match this
                // behavior, we tell the caller to populate using
                // get_user_pages.
                __set_bit(i, state->populate_pages_mask);
            }

            continue;
        }

        // Page is already mapped. Skip migration of this page if requested.
        if (uvm_migrate_args->skip_mapped) {
            __set_bit(i, state->populate_pages_mask);
            continue;
        }

        src_nid = page_to_nid(src_page);

        // Already at destination
        if (src_nid == uvm_migrate_args->dst_node_id) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // Already resident on a CPU node, don't move
        if (UVM_ID_IS_CPU(dst_id) && node_state(src_nid, N_CPU)) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        src_gpu = uvm_va_space_find_gpu_with_memory_node_id(uvm_migrate_args->va_space, src_nid);

        // Already resident on a node with no CPUs that doesn't belong to a
        // GPU, don't move
        if (UVM_ID_IS_CPU(dst_id) && !src_gpu) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // TODO: Bug 2449272: Implement non-P2P copies. All systems that hit
        // this path have P2P copy support between all GPUs in the system, but
        // it could change in the future.

        if (src_gpu)
            src_id = src_gpu->id;
        else
            src_id = UVM_ID_CPU;

        if (!uvm_processor_mask_test_and_set(&state->src_processors, src_id))
            bitmap_zero(state->processors[uvm_id_value(src_id)].page_mask, state->num_pages);

        __set_bit(i, state->processors[uvm_id_value(src_id)].page_mask);
    }
}

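// Allocate a destination page on uvm_migrate_args->dst_node_id. Returns NULL
// on failure, which includes allocations that the kernel satisfied on the
// wrong node as well as failures injected by the built-in tests.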
static struct page *migrate_vma_alloc_page(migrate_vma_state_t *state)
{
    struct page *dst_page;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.migrate_vma_allocation_fail_nth) == 0) {
        dst_page = NULL;
    }
    else {
        dst_page = alloc_pages_node(uvm_migrate_args->dst_node_id, g_migrate_vma_gfp_flags, 0);

        // TODO: Bug 2399573: Linux commit
        // 183f6371aac2a5496a8ef2b0b0a68562652c3cdb introduced a bug that makes
        // __GFP_THISNODE not always be honored (this was later fixed in commit
        // 7810e6781e0fcbca78b91cf65053f895bf59e85f). Therefore, we check
        // whether the flag has been honored and abort the allocation if it
        // hasn't. Remove this check when the fix is deployed on all production
        // systems.
        if (dst_page && page_to_nid(dst_page) != uvm_migrate_args->dst_node_id) {
            __free_page(dst_page);
            dst_page = NULL;
        }
    }

    return dst_page;
}

static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
                                                 unsigned long *dst,
                                                 unsigned long start,
                                                 unsigned long outer,
                                                 migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(dst_id)].page_mask;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_push_t push;
    unsigned long i;

    // Nothing to do
    if (state->num_populate_anon_pages == 0)
        return NV_OK;

    UVM_ASSERT(state->num_populate_anon_pages == bitmap_weight(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t dst_address;
        struct page *dst_page;

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            // Try to get a GPU attached to the node being populated. If there
            // is none, use any of the GPUs registered in the VA space.
            if (UVM_ID_IS_CPU(dst_id)) {
                copying_gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, uvm_migrate_args->dst_node_id);
                if (!copying_gpu)
                    copying_gpu = uvm_va_space_find_first_gpu(va_space);
            }
            else {
                copying_gpu = uvm_va_space_get_gpu(va_space, dst_id);
            }

            UVM_ASSERT(copying_gpu);

            status = migrate_vma_zero_begin_push(va_space, dst_id, copying_gpu, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);
        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all memsets in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memset_8(&push, dst_address, 0, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

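// Copy all pages currently resident on src_id to newly allocated destination
// pages, batching the copies into a single push whose completion is added to
// state->tracker. Allocation failures are recorded in allocation_failed_mask
// and the corresponding pages are skipped.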
static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             unsigned long *dst,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_processor_id_t src_id,
                                             migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_push_t push;
    unsigned long i;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(src_id)].page_mask;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(!bitmap_empty(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            status = migrate_vma_copy_begin_push(va_space, dst_id, src_id, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }

            copying_gpu = uvm_push_get_gpu(&push);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        // We don't have a case where both src and dst use the SYS aperture, so
        // the second call can't overwrite a dma addr set up by the first call.
        status = migrate_vma_page_copy_address(src_page, i, src_id, copying_gpu, state, &src_address);
        if (status == NV_OK)
            status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);

        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all copies in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // TODO: Bug 1766424: If the destination is a GPU and the copy was done by
    //       that GPU, use a GPU-local membar if no peer nor the CPU can
    //       currently map this page. When peer access gets enabled, do a
    //       MEMBAR_SYS at that point.
    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

static NV_STATUS migrate_vma_copy_pages(struct vm_area_struct *vma,
                                        const unsigned long *src,
                                        unsigned long *dst,
                                        unsigned long start,
                                        unsigned long outer,
                                        migrate_vma_state_t *state)
{
    uvm_processor_id_t src_id;

    for_each_id_in_mask(src_id, &state->src_processors) {
        NV_STATUS status = migrate_vma_copy_pages_from(vma, src, dst, start, outer, src_id, state);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

void migrate_vma_cleanup_pages(unsigned long *dst, unsigned long npages)
{
    unsigned long i;

    for (i = 0; i < npages; i++) {
        struct page *dst_page = migrate_pfn_to_page(dst[i]);

        if (!dst_page)
            continue;

        unlock_page(dst_page);
        __free_page(dst_page);
        dst[i] = 0;
    }
}

void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
{
    struct vm_area_struct *vma = args->vma;
    unsigned long start = args->start;
    unsigned long outer = args->end;
    NV_STATUS tracker_status;

    uvm_tracker_init(&state->tracker);

    state->num_pages = (outer - start) / PAGE_SIZE;
    state->status = NV_OK;

    migrate_vma_compute_masks(vma, args->src, state);

    state->status = migrate_vma_populate_anon_pages(vma, args->dst, start, outer, state);

    if (state->status == NV_OK)
        state->status = migrate_vma_copy_pages(vma, args->src, args->dst, start, outer, state);

    // Wait for the tracker since all copies must have completed before
    // returning.
    tracker_status = uvm_tracker_wait_deinit(&state->tracker);

    if (state->status == NV_OK)
        state->status = tracker_status;

    // Mark all pages as not migrating if we're failing
    if (state->status != NV_OK)
        migrate_vma_cleanup_pages(args->dst, state->num_pages);
}

void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
                                           const unsigned long *src,
                                           unsigned long *dst,
                                           unsigned long start,
                                           unsigned long end,
                                           void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
}

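// Called after migrate_vma_pages(): decide which pages still need to be
// populated with get_user_pages by the caller, touch the migrated or already
// resident pages if requested, and remove the DMA mappings created for the
// copies.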
void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
{
    unsigned long i;

    for (i = 0; i < state->num_pages; i++) {
        bool needs_touch = false;
        uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

        // The page was successfully migrated.
        if (args->src[i] & MIGRATE_PFN_MIGRATE) {
            // Touch if requested, since population of these pages won't be
            // tried later.
            needs_touch = true;
        }
        else {
            // The page was not migrated. This can happen for two reasons.
            //
            // 1. The page is already resident at the destination.
            // 2. The page failed migration because its state could not be
            //    migrated by the kernel.
            //
            // So, only set the corresponding populate_pages bit if both of the
            // following conditions are true.
            //
            // 1. Trying to populate pages (with gup) which are already
            //    resident at the destination is wasteful but usually harmless,
            //    except in the PROT_NONE case. gup returns
            //    NV_ERR_INVALID_ADDRESS for such pages, which would
            //    incorrectly lead to API migration failures even though
            //    migration worked as expected.
            //
            // 2. The migration failure was not due to an allocation failure in
            //    uvm_migrate_vma_alloc_and_copy(), since such failures are
            //    indicated in allocation_failed_mask. Failures other than
            //    allocation failures likely mean that the page is populated
            //    somewhere, so set the corresponding bit in
            //    populate_pages_mask.
            if (test_bit(i, state->dst_resident_pages_mask)) {
                // If touch was requested, pages in the allocation_failed and
                // populate_pages masks will be touched during population. But
                // pages which are already resident at the destination need to
                // be touched here, since population isn't tried later for such
                // pages.
                needs_touch = true;
            }
            else if (!test_bit(i, state->allocation_failed_mask)) {
                __set_bit(i, state->populate_pages_mask);
            }
        }

        // Touch if requested and needed.
        if (uvm_migrate_args->touch && needs_touch) {
            struct page *dst_page;

            UVM_ASSERT(args->dst[i] & MIGRATE_PFN_VALID);

            dst_page = migrate_pfn_to_page(args->dst[i]);
            UVM_ASSERT(dst_page);
            uvm_touch_page(dst_page);
        }
    }

    // Remove the IOMMU mappings created during the copy
    if (state->dma.num_pages > 0) {
        for_each_set_bit(i, state->dma.page_mask, state->num_pages)
            uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i]->parent, state->dma.addrs[i]);
    }

    UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));
}

void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             const unsigned long *dst,
                                             unsigned long start,
                                             unsigned long end,
                                             void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = (unsigned long *) dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_finalize_and_map(&args, (migrate_vma_state_t *) private);
}

static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *state)
{
    int ret;

#if defined(CONFIG_MIGRATE_VMA_HELPER)
    static const struct migrate_vma_ops uvm_migrate_vma_ops =
    {
        .alloc_and_copy = uvm_migrate_vma_alloc_and_copy_helper,
        .finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
    };

    ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
    if (ret < 0)
        return errno_to_nv_status(ret);
#else // CONFIG_MIGRATE_VMA_HELPER

#if defined(NV_MIGRATE_VMA_FLAGS_PRESENT)
    args->flags = MIGRATE_VMA_SELECT_SYSTEM;
#endif // NV_MIGRATE_VMA_FLAGS_PRESENT

    ret = migrate_vma_setup(args);
    if (ret < 0)
        return errno_to_nv_status(ret);

    uvm_migrate_vma_alloc_and_copy(args, state);
    if (state->status == NV_OK) {
        migrate_vma_pages(args);
        uvm_migrate_vma_finalize_and_map(args, state);
    }

    migrate_vma_finalize(args);
#endif // CONFIG_MIGRATE_VMA_HELPER

    return state->status;
}

static NV_STATUS migrate_pageable_vma_populate_mask(struct vm_area_struct *vma,
                                                    unsigned long start,
                                                    unsigned long outer,
                                                    const unsigned long *mask,
                                                    migrate_vma_state_t *state)
{
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

    while (subregion_first < num_pages) {
        NV_STATUS status;
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        status = uvm_populate_pageable_vma(vma,
                                           start + subregion_first * PAGE_SIZE,
                                           (subregion_outer - subregion_first) * PAGE_SIZE,
                                           0,
                                           uvm_migrate_args->touch,
                                           uvm_migrate_args->populate_permissions);
        if (status != NV_OK)
            return status;

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma_migrate_mask(struct vm_area_struct *vma,
                                                   unsigned long start,
                                                   unsigned long outer,
                                                   const unsigned long *mask,
                                                   migrate_vma_state_t *state)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
    };

    UVM_ASSERT(!uvm_migrate_args->skip_mapped);

    while (subregion_first < num_pages) {
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        args.start = start + subregion_first * PAGE_SIZE;
        args.end = start + subregion_outer * PAGE_SIZE;

        status = nv_migrate_vma(&args, state);
        if (status != NV_OK)
            return status;

        // We ignore allocation failures here since we are just retrying the
        // migration, and the pages must have already been populated by the
        // caller.

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
                                             unsigned long start,
                                             unsigned long outer,
                                             migrate_vma_state_t *state,
                                             unsigned long *next_addr)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
        .start = start,
        .end = outer,
    };

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(start < outer);
    UVM_ASSERT(start >= vma->vm_start);
    UVM_ASSERT(outer <= vma->vm_end);
    UVM_ASSERT(outer - start <= UVM_MIGRATE_VMA_MAX_SIZE);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&uvm_migrate_args->va_space->lock);

    status = nv_migrate_vma(&args, state);
    if (status != NV_OK)
        return status;

    // Save the returned page masks because they can be overwritten by
    // migrate_pageable_vma_migrate_mask().
    bitmap_copy(state->scratch1_mask, state->populate_pages_mask, num_pages);
    bitmap_copy(state->scratch2_mask, state->allocation_failed_mask, num_pages);

    if (!bitmap_empty(state->scratch1_mask, state->num_pages)) {
        // Populate pages using get_user_pages
        status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch1_mask, state);
        if (status != NV_OK)
            return status;

        if (!uvm_migrate_args->skip_mapped) {
            status = migrate_pageable_vma_migrate_mask(vma, start, outer, state->scratch1_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    // There is no need to copy the masks again after the migration is retried.
    // We ignore the allocation_failed, populate_pages and dst_resident_pages
    // masks set by the retried migration.

    if (!bitmap_empty(state->scratch2_mask, state->num_pages)) {
        // If the destination is the CPU, signal user-space to retry with a
        // different node. Otherwise, just try to populate anywhere in the
        // system.
        if (UVM_ID_IS_CPU(uvm_migrate_args->dst_id) && !uvm_migrate_args->populate_on_cpu_alloc_failures) {
            *next_addr = start + find_first_bit(state->scratch2_mask, num_pages) * PAGE_SIZE;
            return NV_ERR_MORE_PROCESSING_REQUIRED;
        }
        else {
            status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch2_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
                                      unsigned long start,
                                      unsigned long outer,
                                      migrate_vma_state_t *state,
                                      unsigned long *next_addr)
{
    NV_STATUS status = NV_OK;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(vma->vm_end > start);
    UVM_ASSERT(vma->vm_start < outer);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // Adjust to input range boundaries
    start = max(start, vma->vm_start);
    outer = min(outer, vma->vm_end);

    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
    //       support for it is added to the Linux kernel
    if (!vma_is_anonymous(vma))
        return NV_WARN_NOTHING_TO_DO;

    if (uvm_processor_mask_empty(&va_space->registered_gpus))
        return NV_WARN_NOTHING_TO_DO;

    while (start < outer) {
        const size_t region_size = min(outer - start, UVM_MIGRATE_VMA_MAX_SIZE);

        status = migrate_pageable_vma_region(vma, start, start + region_size, state, next_addr);
        if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(*next_addr >= start);
            UVM_ASSERT(*next_addr < outer);
        }

        if (status != NV_OK)
            break;

        start += region_size;
    }

    return status;
}

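// Validate and migrate the requested range one VMA at a time, since
// migrate_vma operates on a single VMA. VMAs that migrate_vma can't handle
// (file-backed mappings, or no GPUs registered in the VA space) are populated
// instead and reported back through user_space_start/user_space_length so that
// user mode can retry with move_pages.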
static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
{
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    const unsigned long length = uvm_migrate_args->length;
    NvU64 *user_space_start = uvm_migrate_args->user_space_start;
    NvU64 *user_space_length = uvm_migrate_args->user_space_length;
    struct mm_struct *mm = uvm_migrate_args->mm;
    unsigned long start = uvm_migrate_args->start;
    unsigned long outer = start + length;
    unsigned long prev_outer = outer;
    struct vm_area_struct *vma;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(length));
    uvm_assert_mmap_lock_locked(mm);

    vma = find_vma_intersection(mm, start, outer);
    if (!vma || (start < vma->vm_start))
        return NV_ERR_INVALID_ADDRESS;

    // VMAs are validated and migrated one at a time, since migrate_vma works
    // on one vma at a time
    for (; vma->vm_start <= prev_outer; vma = find_vma_intersection(mm, prev_outer, outer)) {
        unsigned long next_addr = 0;
        NV_STATUS status;

        // Callers have already validated the range, so the vma should be
        // valid.
        UVM_ASSERT(vma);

        status = migrate_pageable_vma(vma, start, outer, state, &next_addr);
        if (status == NV_WARN_NOTHING_TO_DO) {
            NV_STATUS populate_status = NV_OK;
            bool touch = uvm_migrate_args->touch;
            uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;

            UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));

            // We can't use migrate_vma to move the pages as desired. Normally
            // this fallback path is supposed to populate the memory and then
            // inform user mode that it should call move_pages, but that
            // move_pages call won't work as expected if the caller is in the
            // wrong process. Make that failure explicit so the caller is aware
            // that move_pages won't behave as expected.
            //
            // If the caller is a kernel thread, such as the GPU BH, continue
            // with population since there's no move_pages fallback.
            if (current->mm != mm && !(current->flags & PF_KTHREAD))
                return NV_ERR_NOT_SUPPORTED;

            // Populate pages with uvm_populate_pageable
            populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
            if (populate_status == NV_OK) {
                *user_space_start = max(vma->vm_start, start);
                *user_space_length = min(vma->vm_end, outer) - *user_space_start;
            }
            else {
                status = populate_status;
            }
        }
        else if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(next_addr >= start);
            UVM_ASSERT(next_addr < outer);
            UVM_ASSERT(UVM_ID_IS_CPU(uvm_migrate_args->dst_id));

            *user_space_start = next_addr;
        }

        if (status != NV_OK)
            return status;

        if (vma->vm_end >= outer)
            return NV_OK;

        prev_outer = vma->vm_end;
    }

    // Input range not fully covered by VMAs.
    return NV_ERR_INVALID_ADDRESS;
}

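// Entry point for migrating pageable memory. Validates dst_node_id for CPU
// destinations (or derives it from the destination GPU), allocates the
// per-call migrate_vma_state_t from the slab cache, and runs the migration.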
NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
{
    migrate_vma_state_t *state = NULL;
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    int dst_node_id = uvm_migrate_args->dst_node_id;

    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->start));
    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->length));
    uvm_assert_mmap_lock_locked(uvm_migrate_args->mm);

    if (UVM_ID_IS_CPU(dst_id)) {
        // We only check that dst_node_id is a valid node in the system and
        // that it doesn't correspond to a GPU node. This is fine because
        // alloc_pages_node will clamp the allocation to
        // cpuset_current_mems_allowed when uvm_migrate_pageable is called from
        // process context (uvm_migrate) with a CPU dst_id. The UVM bottom half
        // calls uvm_migrate_pageable with a CPU dst_id only when the VMA
        // memory policy is set to dst_node_id and dst_node_id is not
        // NUMA_NO_NODE.
        if (!nv_numa_node_has_memory(dst_node_id) ||
            uvm_va_space_find_gpu_with_memory_node_id(va_space, dst_node_id) != NULL)
            return NV_ERR_INVALID_ARGUMENT;
    }
    else {
        // The incoming dst_node_id is only valid if dst_id belongs to the CPU.
        // If dst_id is a GPU, use that GPU's memory node id as dst_node_id
        // instead.
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_va_space_get_gpu(va_space, dst_id));
    }

    state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
    if (!state)
        return NV_ERR_NO_MEMORY;

    state->uvm_migrate_args = uvm_migrate_args;
    status = migrate_pageable(state);

    kmem_cache_free(g_uvm_migrate_vma_state_cache, state);

    return status;
}

NV_STATUS uvm_migrate_pageable_init(void)
{
    g_uvm_migrate_vma_state_cache = NV_KMEM_CACHE_CREATE("migrate_vma_state_t", migrate_vma_state_t);
    if (!g_uvm_migrate_vma_state_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_migrate_pageable_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_migrate_vma_state_cache);
}

#endif