/*******************************************************************************
    Copyright (c) 2018-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_migrate_pageable.h"
#include "uvm_populate_pageable.h"

#ifdef UVM_MIGRATE_VMA_SUPPORTED

static struct kmem_cache *g_uvm_migrate_vma_state_cache __read_mostly;

static const gfp_t g_migrate_vma_gfp_flags = NV_UVM_GFP_FLAGS | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE;

// Compute the address needed for copying_gpu to access the given page,
// resident on resident_id.
static NV_STATUS migrate_vma_page_copy_address(struct page *page,
                                               unsigned long page_index,
                                               uvm_processor_id_t resident_id,
                                               uvm_gpu_t *copying_gpu,
                                               migrate_vma_state_t *state,
                                               uvm_gpu_address_t *gpu_addr)
{
    uvm_va_space_t *va_space = state->uvm_migrate_args->va_space;
    uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id) ? NULL :
                                                         uvm_va_space_get_gpu(va_space, resident_id);
    const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
                                                       resident_id);
    const bool direct_peer = owning_gpu &&
                             (owning_gpu != copying_gpu) &&
                             can_copy_from &&
                             !uvm_gpu_peer_caps(owning_gpu, copying_gpu)->is_indirect_peer;

    UVM_ASSERT(page_index < state->num_pages);

    memset(gpu_addr, 0, sizeof(*gpu_addr));

    if (owning_gpu == copying_gpu) {
        // Local vidmem address
        *gpu_addr = uvm_gpu_address_from_phys(uvm_gpu_page_to_phys_address(owning_gpu, page));
    }
    else if (direct_peer) {
        // Direct GPU peer
        uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
        uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);

        *gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
    }
    else {
        // Sysmem/Indirect Peer
        NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);

        if (status != NV_OK)
            return status;

        state->dma.addrs_gpus[page_index] = copying_gpu;

        if (state->dma.num_pages++ == 0)
            bitmap_zero(state->dma.page_mask, state->num_pages);

        UVM_ASSERT(!test_bit(page_index, state->dma.page_mask));

        __set_bit(page_index, state->dma.page_mask);

        *gpu_addr = uvm_gpu_address_physical(UVM_APERTURE_SYS, state->dma.addrs[page_index]);
    }

    return NV_OK;
}

// Return the GPU identified with the given NUMA node id
static uvm_gpu_t *get_gpu_from_node_id(uvm_va_space_t *va_space, int node_id)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        if (uvm_gpu_numa_info(gpu)->node_id == node_id)
            return gpu;
    }

    return NULL;
}

// Create a new push to zero pages on dst_id
static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_gpu_t *gpu,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;

    if (UVM_ID_IS_CPU(dst_id)) {
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        UVM_ASSERT(uvm_id_equal(dst_id, gpu->id));
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Zero %s from %s VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, dst_id),
                          uvm_va_space_processor_name(va_space, gpu->id),
                          start,
                          outer);
}

// Create a new push to copy pages between src_id and dst_id
static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_processor_id_t src_id,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;
    uvm_gpu_t *gpu;

    UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
                   "Unexpected copy to self, processor %s\n",
                   uvm_va_space_processor_name(va_space, src_id));

    if (UVM_ID_IS_CPU(src_id)) {
        gpu = uvm_va_space_get_gpu(va_space, dst_id);
        channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
    }
    else if (UVM_ID_IS_CPU(dst_id)) {
        gpu = uvm_va_space_get_gpu(va_space, src_id);
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        // For GPU to GPU copies, prefer to "push" the data from the source as
        // that works better
        gpu = uvm_va_space_get_gpu(va_space, src_id);

        channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
    }

    // NUMA-enabled GPUs can copy to any other NUMA node in the system even if
    // P2P access has not been explicitly enabled (ie va_space->can_copy_from
    // is not set).
    if (!gpu->parent->numa_info.enabled) {
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], src_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
    }

    if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
        uvm_gpu_t *dst_gpu = uvm_va_space_get_gpu(va_space, dst_id);
        return uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                         dst_gpu,
                                         push,
                                         "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                                         uvm_va_space_processor_name(va_space, src_id),
                                         uvm_va_space_processor_name(va_space, dst_id),
                                         start,
                                         outer);
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, src_id),
                          uvm_va_space_processor_name(va_space, dst_id),
                          start,
                          outer);
}

static void migrate_vma_compute_masks(struct vm_area_struct *vma, const unsigned long *src, migrate_vma_state_t *state)
{
    unsigned long i;
    const bool is_rw = vma->vm_flags & VM_WRITE;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;

    UVM_ASSERT(vma_is_anonymous(vma));

    bitmap_zero(state->populate_pages_mask, state->num_pages);
    bitmap_zero(state->allocation_failed_mask, state->num_pages);
    bitmap_zero(state->dst_resident_pages_mask, state->num_pages);

    uvm_processor_mask_zero(&state->src_processors);
    state->num_populate_anon_pages = 0;
    state->dma.num_pages = 0;

    for (i = 0; i < state->num_pages; ++i) {
        uvm_processor_id_t src_id;
        struct page *src_page = NULL;
        int src_nid;
        uvm_gpu_t *src_gpu = NULL;

        // Skip pages that cannot be migrated
        if (!(src[i] & MIGRATE_PFN_MIGRATE)) {
            // This can happen in two cases:
            // - The page is populated but can't be migrated.
            // - The page isn't populated.
            // In both cases, treat the page as failing migration and populate
            // it with get_user_pages.
            if (!(src[i] & MIGRATE_PFN_VALID))
                __set_bit(i, state->populate_pages_mask);

            continue;
        }

        src_page = migrate_pfn_to_page(src[i]);
        if (!src_page) {
            if (is_rw) {
                // Populate PROT_WRITE vmas in migrate_vma so we can use the
                // GPU's copy engines
                if (state->num_populate_anon_pages++ == 0)
                    bitmap_zero(state->processors[uvm_id_value(dst_id)].page_mask, state->num_pages);

                __set_bit(i, state->processors[uvm_id_value(dst_id)].page_mask);
            }
            else {
                // PROT_NONE vmas cannot be populated. PROT_READ anonymous vmas
                // are populated using the zero page. In order to match this
                // behavior, we tell the caller to populate using
                // get_user_pages.
                __set_bit(i, state->populate_pages_mask);
            }

            continue;
        }

        // Page is already mapped. Skip migration of this page if requested.
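        // Pages skipped this way are added to populate_pages_mask, so the
        // later get_user_pages pass still applies the requested populate
        // permissions and optional touch without copying the data; with
        // skip_mapped set, no retry migration is attempted for them either
        // (see migrate_pageable_vma_region()).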
        if (uvm_migrate_args->skip_mapped) {
            __set_bit(i, state->populate_pages_mask);
            continue;
        }

        src_nid = page_to_nid(src_page);

        // Already at destination
        if (src_nid == uvm_migrate_args->dst_node_id) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // Already resident on a CPU node, don't move
        if (UVM_ID_IS_CPU(dst_id) && node_state(src_nid, N_CPU)) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        src_gpu = get_gpu_from_node_id(uvm_migrate_args->va_space, src_nid);

        // Already resident on a node with no CPUs that doesn't belong to a
        // GPU, don't move
        if (UVM_ID_IS_CPU(dst_id) && !src_gpu) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // TODO: Bug 2449272: Implement non-P2P copies. All systems that hit
        //       this path have P2P copy support between all GPUs in the
        //       system, but it could change in the future.

        if (src_gpu)
            src_id = src_gpu->id;
        else
            src_id = UVM_ID_CPU;

        if (!uvm_processor_mask_test_and_set(&state->src_processors, src_id))
            bitmap_zero(state->processors[uvm_id_value(src_id)].page_mask, state->num_pages);

        __set_bit(i, state->processors[uvm_id_value(src_id)].page_mask);
    }
}

static struct page *migrate_vma_alloc_page(migrate_vma_state_t *state)
{
    struct page *dst_page;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.migrate_vma_allocation_fail_nth) == 0) {
        dst_page = NULL;
    }
    else {
        dst_page = alloc_pages_node(uvm_migrate_args->dst_node_id, g_migrate_vma_gfp_flags, 0);

        // TODO: Bug 2399573: Linux commit
        //       183f6371aac2a5496a8ef2b0b0a68562652c3cdb introduced a bug that
        //       makes __GFP_THISNODE not always be honored (this was later
        //       fixed in commit 7810e6781e0fcbca78b91cf65053f895bf59e85f).
        //       Therefore, we verify whether the flag has been honored and
        //       abort the allocation otherwise. Remove this check when the fix
        //       is deployed on all production systems.
        if (dst_page && page_to_nid(dst_page) != uvm_migrate_args->dst_node_id) {
            __free_page(dst_page);
            dst_page = NULL;
        }
    }

    return dst_page;
}

static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
                                                 unsigned long *dst,
                                                 unsigned long start,
                                                 unsigned long outer,
                                                 migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(dst_id)].page_mask;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_push_t push;
    unsigned long i;

    // Nothing to do
    if (state->num_populate_anon_pages == 0)
        return NV_OK;

    UVM_ASSERT(state->num_populate_anon_pages == bitmap_weight(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t dst_address;
        struct page *dst_page;

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            // Try to get a GPU attached to the node being populated. If there
            // is none, use any of the GPUs registered in the VA space.
            if (UVM_ID_IS_CPU(dst_id)) {
                copying_gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, uvm_migrate_args->dst_node_id);
                if (!copying_gpu)
                    copying_gpu = uvm_va_space_find_first_gpu(va_space);
            }
            else {
                copying_gpu = uvm_va_space_get_gpu(va_space, dst_id);
            }

            UVM_ASSERT(copying_gpu);

            status = migrate_vma_zero_begin_push(va_space, dst_id, copying_gpu, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);
        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all memsets in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memset_8(&push, dst_address, 0, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             unsigned long *dst,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_processor_id_t src_id,
                                             migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_push_t push;
    unsigned long i;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(src_id)].page_mask;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(!bitmap_empty(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            status = migrate_vma_copy_begin_push(va_space, dst_id, src_id, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }

            copying_gpu = uvm_push_get_gpu(&push);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        // We don't have a case where both src and dst use the SYS aperture, so
        // the second call can't overwrite a dma addr set up by the first call.
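        // For example, migrate_vma_page_copy_address() returns a local vidmem
        // physical address when the page is resident on copying_gpu, a virtual
        // address through the peer identity mapping for a direct GPU peer, and
        // a UVM_APERTURE_SYS physical address backed by a new DMA mapping for
        // sysmem or indirect peers.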
        status = migrate_vma_page_copy_address(src_page, i, src_id, copying_gpu, state, &src_address);
        if (status == NV_OK)
            status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);

        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all copies in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // TODO: Bug 1766424: If the destination is a GPU and the copy was done by
    //       that GPU, use a GPU-local membar if no peer nor the CPU can
    //       currently map this page. When peer access gets enabled, do a
    //       MEMBAR_SYS at that point.
    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

static NV_STATUS migrate_vma_copy_pages(struct vm_area_struct *vma,
                                        const unsigned long *src,
                                        unsigned long *dst,
                                        unsigned long start,
                                        unsigned long outer,
                                        migrate_vma_state_t *state)
{
    uvm_processor_id_t src_id;

    for_each_id_in_mask(src_id, &state->src_processors) {
        NV_STATUS status = migrate_vma_copy_pages_from(vma, src, dst, start, outer, src_id, state);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
{
    struct vm_area_struct *vma = args->vma;
    unsigned long start = args->start;
    unsigned long outer = args->end;
    NV_STATUS tracker_status;

    uvm_tracker_init(&state->tracker);

    state->num_pages = (outer - start) / PAGE_SIZE;
    state->status = NV_OK;

    migrate_vma_compute_masks(vma, args->src, state);

    state->status = migrate_vma_populate_anon_pages(vma, args->dst, start, outer, state);

    if (state->status == NV_OK)
        state->status = migrate_vma_copy_pages(vma, args->src, args->dst, start, outer, state);

    // Wait for the tracker since all copies must have completed before
    // returning
    tracker_status = uvm_tracker_wait_deinit(&state->tracker);

    if (state->status == NV_OK)
        state->status = tracker_status;
}

void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
                                           const unsigned long *src,
                                           unsigned long *dst,
                                           unsigned long start,
                                           unsigned long end,
                                           void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
}

void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
{
    unsigned long i;

    for (i = 0; i < state->num_pages; i++) {
        bool needs_touch = false;
        uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

        // The page was successfully migrated.
        if (args->src[i] & MIGRATE_PFN_MIGRATE) {
            // Touch if requested since population of these pages won't be
            // tried later.
            needs_touch = true;
        }
        else {
            // The page was not migrated. This can happen for two reasons.
            //
            // 1. The page is already resident at the destination.
            // 2. The page failed migration because the page state could not
            //    be migrated by the kernel.
            //
            // So, only set the corresponding populate_pages bit if both of the
            // following hold:
            //
            // 1. The page is not already resident at the destination. Trying
            //    to populate pages (with gup) which are already resident at
            //    the destination is wasteful but usually harmless, except in
            //    the PROT_NONE case: gup returns NV_ERR_INVALID_ADDRESS for
            //    such pages, which would incorrectly turn into an API
            //    migration failure even though migration worked as expected.
            //
            // 2. The migration failure was not due to a page allocation
            //    failure, since such failures are already indicated in
            //    allocation_failed_mask. Failures other than allocation
            //    failures likely mean that the page is populated somewhere, so
            //    set the corresponding bit in populate_pages_mask.
            if (test_bit(i, state->dst_resident_pages_mask)) {
                // If touch was requested, pages in the allocation_failed and
                // populate_pages masks will be touched during population. But
                // pages which are already resident at the destination need to
                // be touched here since population isn't tried later for such
                // pages.
                needs_touch = true;
            }
            else if (!test_bit(i, state->allocation_failed_mask)) {
                __set_bit(i, state->populate_pages_mask);
            }
        }

        // Touch if requested and needed.
        if (uvm_migrate_args->touch && needs_touch) {
            struct page *dst_page;

            UVM_ASSERT(args->dst[i] & MIGRATE_PFN_VALID);

            dst_page = migrate_pfn_to_page(args->dst[i]);
            UVM_ASSERT(dst_page);
            uvm_touch_page(dst_page);
        }
    }

    // Remove the IOMMU mappings created during the copy
    if (state->dma.num_pages > 0) {
        for_each_set_bit(i, state->dma.page_mask, state->num_pages)
            uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i]->parent, state->dma.addrs[i]);
    }

    UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));
}

void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             const unsigned long *dst,
                                             unsigned long start,
                                             unsigned long end,
                                             void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = (unsigned long *) dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_finalize_and_map(&args, (migrate_vma_state_t *) private);
}

static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *state)
{
    int ret;

#if defined(CONFIG_MIGRATE_VMA_HELPER)
    static const struct migrate_vma_ops uvm_migrate_vma_ops =
    {
        .alloc_and_copy = uvm_migrate_vma_alloc_and_copy_helper,
        .finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
    };

    ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
    if (ret < 0)
        return errno_to_nv_status(ret);
#else // CONFIG_MIGRATE_VMA_HELPER

#if defined(NV_MIGRATE_VMA_FLAGS_PRESENT)
    args->flags = MIGRATE_VMA_SELECT_SYSTEM;
#endif // NV_MIGRATE_VMA_FLAGS_PRESENT

    ret = migrate_vma_setup(args);
    if (ret < 0)
        return errno_to_nv_status(ret);

    uvm_migrate_vma_alloc_and_copy(args, state);
    if (state->status == NV_OK) {
        migrate_vma_pages(args);
        uvm_migrate_vma_finalize_and_map(args, state);
    }

    migrate_vma_finalize(args);
#endif // CONFIG_MIGRATE_VMA_HELPER

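    // In both paths above, state->status carries the result of the
    // alloc-and-copy stage: the legacy CONFIG_MIGRATE_VMA_HELPER path runs it
    // through the migrate_vma_ops callbacks, while the newer
    // migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() path
    // calls uvm_migrate_vma_alloc_and_copy() and
    // uvm_migrate_vma_finalize_and_map() directly.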
    return state->status;
}

static NV_STATUS migrate_pageable_vma_populate_mask(struct vm_area_struct *vma,
                                                    unsigned long start,
                                                    unsigned long outer,
                                                    const unsigned long *mask,
                                                    migrate_vma_state_t *state)
{
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

    while (subregion_first < num_pages) {
        NV_STATUS status;
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        status = uvm_populate_pageable_vma(vma,
                                           start + subregion_first * PAGE_SIZE,
                                           (subregion_outer - subregion_first) * PAGE_SIZE,
                                           0,
                                           uvm_migrate_args->touch,
                                           uvm_migrate_args->populate_permissions);
        if (status != NV_OK)
            return status;

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma_migrate_mask(struct vm_area_struct *vma,
                                                   unsigned long start,
                                                   unsigned long outer,
                                                   const unsigned long *mask,
                                                   migrate_vma_state_t *state)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
    };

    UVM_ASSERT(!uvm_migrate_args->skip_mapped);

    while (subregion_first < num_pages) {
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        args.start = start + subregion_first * PAGE_SIZE;
        args.end = start + subregion_outer * PAGE_SIZE;

        status = nv_migrate_vma(&args, state);
        if (status != NV_OK)
            return status;

        // We ignore allocation failures here since we are just retrying the
        // migration; the pages must have already been populated by the caller.

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
                                             unsigned long start,
                                             unsigned long outer,
                                             migrate_vma_state_t *state,
                                             unsigned long *next_addr)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
        .start = start,
        .end = outer,
    };

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(start < outer);
    UVM_ASSERT(start >= vma->vm_start);
    UVM_ASSERT(outer <= vma->vm_end);
    UVM_ASSERT(outer - start <= UVM_MIGRATE_VMA_MAX_SIZE);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&uvm_migrate_args->va_space->lock);

    status = nv_migrate_vma(&args, state);
    if (status != NV_OK)
        return status;

    // Save the returned page masks because they can be overwritten by
    // migrate_pageable_vma_migrate_mask().
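    // (That retry path re-enters nv_migrate_vma(), which recomputes all the
    // per-state masks for the retried subregions.)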
    bitmap_copy(state->scratch1_mask, state->populate_pages_mask, num_pages);
    bitmap_copy(state->scratch2_mask, state->allocation_failed_mask, num_pages);

    if (!bitmap_empty(state->scratch1_mask, state->num_pages)) {
        // Populate pages using get_user_pages
        status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch1_mask, state);
        if (status != NV_OK)
            return status;

        if (!uvm_migrate_args->skip_mapped) {
            status = migrate_pageable_vma_migrate_mask(vma, start, outer, state->scratch1_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    // There is no need to copy the masks again after the migration is retried.
    // We ignore the allocation_failed, populate_pages and dst_resident_pages
    // masks set by the retried migration.

    if (!bitmap_empty(state->scratch2_mask, state->num_pages)) {
        // If the destination is the CPU, signal user-space to retry with a
        // different node. Otherwise, just try to populate anywhere in the
        // system
        if (UVM_ID_IS_CPU(uvm_migrate_args->dst_id)) {
            *next_addr = start + find_first_bit(state->scratch2_mask, num_pages) * PAGE_SIZE;
            return NV_ERR_MORE_PROCESSING_REQUIRED;
        }
        else {
            status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch2_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    return NV_OK;
}

static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
                                      unsigned long start,
                                      unsigned long outer,
                                      migrate_vma_state_t *state,
                                      unsigned long *next_addr)
{
    NV_STATUS status = NV_OK;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(vma->vm_end > start);
    UVM_ASSERT(vma->vm_start < outer);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // Adjust to input range boundaries
    start = max(start, vma->vm_start);
    outer = min(outer, vma->vm_end);

    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
    //       support for it is added to the Linux kernel
    if (!vma_is_anonymous(vma))
        return NV_WARN_NOTHING_TO_DO;

    if (uvm_processor_mask_empty(&va_space->registered_gpus))
        return NV_WARN_NOTHING_TO_DO;

    while (start < outer) {
        const size_t region_size = min(outer - start, UVM_MIGRATE_VMA_MAX_SIZE);

        status = migrate_pageable_vma_region(vma, start, start + region_size, state, next_addr);
        if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(*next_addr >= start);
            UVM_ASSERT(*next_addr < outer);
        }

        if (status != NV_OK)
            break;

        start += region_size;
    }

    return status;
}

static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
{
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    const unsigned long length = uvm_migrate_args->length;
    NvU64 *user_space_start = uvm_migrate_args->user_space_start;
    NvU64 *user_space_length = uvm_migrate_args->user_space_length;
    struct mm_struct *mm = uvm_migrate_args->mm;
    unsigned long start = uvm_migrate_args->start;
    unsigned long outer = start + length;
    unsigned long prev_outer = outer;
    struct vm_area_struct *vma;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(length));
    uvm_assert_mmap_lock_locked(mm);

    vma = find_vma_intersection(mm, start, outer);
    if (!vma || (start < vma->vm_start))
        return NV_ERR_INVALID_ADDRESS;

    // VMAs are validated and migrated one at a time, since migrate_vma works
    // on one vma at a time
    for (; vma->vm_start <= prev_outer; vma = find_vma_intersection(mm, prev_outer, outer)) {
        unsigned long next_addr = 0;
        NV_STATUS status;

        // Callers have already validated the range so the vma should be valid.
        UVM_ASSERT(vma);

        status = migrate_pageable_vma(vma, start, outer, state, &next_addr);
        if (status == NV_WARN_NOTHING_TO_DO) {
            NV_STATUS populate_status = NV_OK;
            bool touch = uvm_migrate_args->touch;
            uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;

            UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));

            // We can't use migrate_vma to move the pages as desired. Normally
            // this fallback path is supposed to populate the memory and then
            // inform user mode that it should call move_pages, but that
            // move_pages call won't work as expected if the caller is in the
            // wrong process. Make that failure explicit so the caller is aware
            // that move_pages won't behave as expected.
            //
            // If the caller is a kernel thread, such as the GPU BH, continue
            // with population since there's no move_pages fallback.
            if (current->mm != mm && !(current->flags & PF_KTHREAD))
                return NV_ERR_NOT_SUPPORTED;

            // Populate pages with uvm_populate_pageable
            populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
            if (populate_status == NV_OK) {
                *user_space_start = max(vma->vm_start, start);
                *user_space_length = min(vma->vm_end, outer) - *user_space_start;
            }
            else {
                status = populate_status;
            }
        }
        else if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(next_addr >= start);
            UVM_ASSERT(next_addr < outer);
            UVM_ASSERT(UVM_ID_IS_CPU(uvm_migrate_args->dst_id));

            *user_space_start = next_addr;
        }

        if (status != NV_OK)
            return status;

        if (vma->vm_end >= outer)
            return NV_OK;

        prev_outer = vma->vm_end;
    }

    // Input range not fully covered by VMAs.
    return NV_ERR_INVALID_ADDRESS;
}

NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
{
    migrate_vma_state_t *state = NULL;
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    int dst_node_id = uvm_migrate_args->dst_node_id;

    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->start));
    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->length));
    uvm_assert_mmap_lock_locked(uvm_migrate_args->mm);

    if (UVM_ID_IS_CPU(dst_id)) {
        // We only check that dst_node_id is a valid node in the system and
        // that it doesn't correspond to a GPU node. This is fine because
        // alloc_pages_node will clamp the allocation to
        // cpuset_current_mems_allowed, and uvm_migrate_pageable is only called
        // from process context (uvm_migrate) when dst_id is the CPU. The UVM
        // bottom half never calls uvm_migrate_pageable with a CPU dst_id, so
        // assert that we're in a user thread. This would need to change if we
        // wanted to call this function from a bottom half with a CPU dst_id.
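        // For example, a caller migrating to system memory passes
        // dst_id == UVM_ID_CPU plus a memory-bearing CPU NUMA node in
        // dst_node_id; for a GPU destination, dst_node_id is ignored and
        // replaced below with the GPU's own NUMA node id.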
        UVM_ASSERT(!(current->flags & PF_KTHREAD));

        if (!nv_numa_node_has_memory(dst_node_id) || get_gpu_from_node_id(va_space, dst_node_id) != NULL)
            return NV_ERR_INVALID_ARGUMENT;
    }
    else {
        // The incoming dst_node_id is only valid if dst_id belongs to the CPU.
        // Otherwise, overwrite dst_node_id with the destination GPU's node id.
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_info(uvm_va_space_get_gpu(va_space, dst_id))->node_id;
    }

    state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
    if (!state)
        return NV_ERR_NO_MEMORY;

    state->uvm_migrate_args = uvm_migrate_args;
    status = migrate_pageable(state);

    kmem_cache_free(g_uvm_migrate_vma_state_cache, state);

    return status;
}

NV_STATUS uvm_migrate_pageable_init(void)
{
    g_uvm_migrate_vma_state_cache = NV_KMEM_CACHE_CREATE("migrate_vma_state_t", migrate_vma_state_t);
    if (!g_uvm_migrate_vma_state_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_migrate_pageable_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_migrate_vma_state_cache);
}

#endif // UVM_MIGRATE_VMA_SUPPORTED