1 /******************************************************************************* 2 Copyright (c) 2018-2022 NVIDIA Corporation 3 4 Permission is hereby granted, free of charge, to any person obtaining a copy 5 of this software and associated documentation files (the "Software"), to 6 deal in the Software without restriction, including without limitation the 7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 sell copies of the Software, and to permit persons to whom the Software is 9 furnished to do so, subject to the following conditions: 10 11 The above copyright notice and this permission notice shall be 12 included in all copies or substantial portions of the Software. 13 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 DEALINGS IN THE SOFTWARE. 21 22 *******************************************************************************/ 23 24 #include "uvm_common.h" 25 #include "uvm_kvmalloc.h" 26 #include "uvm_va_space.h" 27 #include "uvm_va_space_mm.h" 28 #include "uvm_ats.h" 29 #include "uvm_api.h" 30 #include "uvm_test.h" 31 #include "uvm_test_ioctl.h" 32 33 #if defined(NV_LINUX_SCHED_MM_H_PRESENT) 34 #include <linux/sched/mm.h> 35 #elif defined(NV_LINUX_SCHED_H_PRESENT) 36 #include <linux/sched.h> 37 #endif 38 39 // 40 // This comment block describes some implementation rationale. See the header 41 // for the API descriptions. 42 // 43 // ========================= Retain count vs mm_users ========================== 44 // 45 // We use two methods to guarantee the mm is available and won't be destroyed. 46 // 47 // On the call paths where mmput() can be called, we call 48 // uvm_va_space_mm_or_current_retain() which calls mmget_not_zero(). This 49 // prevents mm teardown and avoids races with uvm_va_space_mm_shutdown() since 50 // it prevents mmput() -> __mmput() -> exit_mmap() -> mmu_notifier_release() -> 51 // uvm_va_space_mm_shutdown() until uvm_va_space_mm_or_current_release(), and 52 // we guarantee that we can't call uvm_va_space_mm_unregister() -> 53 // mmu_notifier_unregister() -> uvm_va_space_mm_shutdown() path when someone is 54 // about to call uvm_va_space_mm_or_current_retain(). 55 // Kernel calls like mmu_interval_notifier_insert() require mm_users to be 56 // greater than 0. In general, these are the ioctl paths. 57 // 58 // On the replayable GPU fault handling path, we need the mm to be able to 59 // service faults in the window when mm_users == 0 but mmu_notifier_release() 60 // hasn't yet been called. We can't call mmput() because it may result in 61 // exit_mmap(), which could result in RM calls and VA space destroy. Those need 62 // to wait for the GPU fault handler to finish, so on that path we use an 63 // internal retained reference count and wait queue. When the mm is disabled 64 // via mmu_notifier_release(), we use the wait queue to wait for the reference 65 // count to go to 0. 66 // We also use this path for older Linux kernels where mm_users > 0 isn't 67 // required. 68 // 69 // ============================ Handling mm teardown =========================== 70 // 71 // mmu_notifiers call the mm release callback both when the mm is really getting 72 // shut down, and whenever mmu_notifier_unregister is called. This has several 73 // consequences, including that these two paths can race. If they do race, they 74 // wait for each other to finish (real teardown of the mm won't start until the 75 // mmu_notifier_unregister's callback has returned, and mmu_notifier_unregister 76 // won't return until the mm release callback has returned). 77 // 78 // When the mm is really getting torn down, uvm_va_space_mm_shutdown is expected 79 // to stop all GPU memory accesses to that mm and stop servicing faults in that 80 // mm. This essentially shuts down the VA space for new work. The VA space 81 // object remains valid for most teardown ioctls until the file is closed, 82 // because it's legal for the associated process to die then for another process 83 // with a reference on the file to perform the unregisters or associated ioctls. 84 // This is particularly true for tools users. 85 // 86 // An exception to the above is UvmUnregisterChannel. Since channels are 87 // completely removed from the VA space on mm teardown, later channel 88 // unregisters will fail to find the handles and will return an error. 89 // 90 // The UVM driver will only call mmu_notifier_unregister during VA space destroy 91 // (file close). 92 // 93 // Here is a table of the various teardown scenarios: 94 // 95 // Can race with 96 // Scenario mm teardown 97 // ----------------------------------------------------------------------------- 98 // 1) Process exit (mm teardown, file open) - 99 // 2) Explicit file close in original mm No 100 // 3) Explicit file close in different mm Yes 101 // 4) Implicit file close (exit) in original mm No 102 // 5) Implicit file close (exit) in different mm Yes 103 // 104 // At a high level, the sequence of operations to perform during mm teardown is: 105 // 106 // 1) Stop all channels 107 // - Prevents new faults and accesses on non-MPS 108 // 2) Detach all channels 109 // - Prevents pending faults from being translated to this VA space 110 // - Non-replayable faults will be dropped so no new ones can arrive 111 // - Access counter notifications will be prevented from getting new 112 // translations to this VA space. Pending entries may attempt to retain 113 // the mm, but will drop the notification if they can't be serviced. 114 // 3) Flush the fault buffer 115 // - The only reason to flush the fault buffer is to avoid spurious 116 // cancels. If we didn't flush the fault buffer before marking the mm 117 // as dead, then remaining faults which require the mm would be 118 // cancelled. Since the faults might be stale, we would record cancel 119 // events which didn't really happen (the access didn't happen after 120 // the mm died). By flushing we clear out all stale faults, and in 121 // the case of MPS, cancel real faults after. 122 // 4) UnsetPageDir 123 // - Prevents new accesses on MPS 124 // 5) Mark the va_space_mm as dead 125 // - Prevents new retainers from using the mm. There won't be any more on 126 // the fault handling paths, but there could be others in worker threads. 127 // 128 // Here are some tables of each step in the sequence, and what operations can 129 // still be performed after each step. This is all from the perspective of a 130 // single VA space. "Untranslated" means that the fault entry has not been 131 // translated to a uvm_va_space yet. 132 // 133 // Replayable non-MPS Behavior: 134 // 135 // Can Pending Pending Can be 136 // access Can untranslated translated servicing 137 // memory fault faults faults faults 138 // ----------------------------------------------------------------------------- 139 // Shutdown start Yes Yes Service Service Yes 140 // Stop channels No No Service [1] Service [1] Yes [1] 141 // Detach channels No No Flush buffer Service [1] Yes [1], [2] 142 // Flush buffer No No None possible None possible No 143 // UnsetPageDir No No None possible None possible No 144 // 145 // 146 // Replayable MPS Behavior: 147 // 148 // Can Pending Pending Can be 149 // access Can untranslated translated servicing 150 // memory fault faults faults faults 151 // ----------------------------------------------------------------------------- 152 // Shutdown start Yes Yes Service Service Yes 153 // Stop channels Yes Yes Service Service Yes 154 // Detach channels Yes Yes Cancel, flush Service Yes 155 // Flush buffer Yes Yes Cancel, flush None possible No 156 // UnsetPageDir No [3] Yes Cancel, flush None possible No 157 // 158 // 159 // [1]: All pending faults in this VA space are stale since channel stop 160 // preempted the context. 161 // [2]: Faults in this VA space can't be serviced concurrently with detach since 162 // detach holds the VA space lock in write mode. Faults in other VA spaces 163 // can be serviced, and stale faults in this VA space can resume service 164 // after detach is done. 165 // [3]: Due to the nature of MPS, remaining work which had started under the VA 166 // space could still execute and attempt to make memory accesses. However, 167 // since the PDB at that point is empty and ATS is disabled (if available), 168 // all accesses will fault and be cancelled rather than successfully 169 // translate to physical memory. 170 // 171 // ============================================================================= 172 173 #define UVM_VA_SPACE_MM_SHUTDOWN_DELAY_MAX_MS 100 174 175 static int uvm_enable_va_space_mm = 1; 176 module_param(uvm_enable_va_space_mm, int, S_IRUGO); 177 MODULE_PARM_DESC(uvm_enable_va_space_mm, 178 "Set to 0 to disable UVM from using mmu_notifiers to create " 179 "an association between a UVM VA space and a process. This " 180 "will also disable pageable memory access via either ATS or " 181 "HMM."); 182 183 bool uvm_va_space_mm_enabled_system(void) 184 { 185 return UVM_CAN_USE_MMU_NOTIFIERS() && uvm_enable_va_space_mm; 186 } 187 188 bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space) 189 { 190 // A va_space doesn't have any association with an mm in multi-process 191 // sharing mode. 192 if (va_space->initialization_flags & UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE) 193 return false; 194 195 return uvm_va_space_mm_enabled_system(); 196 } 197 198 static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space); 199 200 #if !defined(NV_MMGET_NOT_ZERO_PRESENT) 201 static bool mmget_not_zero(struct mm_struct *mm) 202 { 203 return atomic_inc_not_zero(&mm->mm_users); 204 } 205 #endif 206 207 #if UVM_CAN_USE_MMU_NOTIFIERS() 208 209 static void uvm_mmput(struct mm_struct *mm) 210 { 211 mmput(mm); 212 } 213 214 static uvm_va_space_t *get_va_space(struct mmu_notifier *mn) 215 { 216 // This may be called without a thread context present, so be careful 217 // what is used here. 218 return container_of(mn, uvm_va_space_t, va_space_mm.mmu_notifier); 219 } 220 221 static void uvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) 222 { 223 UVM_ENTRY_VOID(uvm_va_space_mm_shutdown(get_va_space(mn))); 224 } 225 226 static void uvm_mmu_notifier_invalidate_range_ats(struct mmu_notifier *mn, 227 struct mm_struct *mm, 228 unsigned long start, 229 unsigned long end) 230 { 231 // In most cases ->invalidate_range() is called with exclusive end. 232 // uvm_ats_invalidate() expects an inclusive end so we have to 233 // convert it. 234 // 235 // There's a special case however. Kernel TLB gathering sometimes 236 // identifies "fullmm" invalidates by setting both start and end to ~0. 237 // 238 // It's unclear if there are any other cases in which the kernel will 239 // call us with start == end. Since we can't definitively say no, we 240 // conservatively treat all such calls as full invalidates. 241 if (start == end) { 242 start = 0; 243 end = ~0UL; 244 } 245 else { 246 --end; 247 } 248 249 UVM_ENTRY_VOID(uvm_ats_invalidate(get_va_space(mn), start, end)); 250 } 251 252 static struct mmu_notifier_ops uvm_mmu_notifier_ops_release = 253 { 254 .release = uvm_mmu_notifier_release, 255 }; 256 257 static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats = 258 { 259 .release = uvm_mmu_notifier_release, 260 .invalidate_range = uvm_mmu_notifier_invalidate_range_ats, 261 }; 262 263 static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm) 264 { 265 UVM_ASSERT(va_space_mm->mm); 266 uvm_assert_mmap_lock_locked_write(va_space_mm->mm); 267 268 if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled) 269 va_space_mm->mmu_notifier.ops = &uvm_mmu_notifier_ops_ats; 270 else 271 va_space_mm->mmu_notifier.ops = &uvm_mmu_notifier_ops_release; 272 273 return __mmu_notifier_register(&va_space_mm->mmu_notifier, va_space_mm->mm); 274 } 275 276 static void uvm_mmu_notifier_unregister(uvm_va_space_mm_t *va_space_mm) 277 { 278 mmu_notifier_unregister(&va_space_mm->mmu_notifier, va_space_mm->mm); 279 } 280 #else 281 static void uvm_mmput(struct mm_struct *mm) 282 { 283 UVM_ASSERT(0); 284 } 285 286 static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm) 287 { 288 UVM_ASSERT(0); 289 return 0; 290 } 291 292 static void uvm_mmu_notifier_unregister(uvm_va_space_mm_t *va_space_mm) 293 { 294 UVM_ASSERT(0); 295 } 296 #endif // UVM_CAN_USE_MMU_NOTIFIERS() 297 298 NV_STATUS uvm_va_space_mm_register(uvm_va_space_t *va_space) 299 { 300 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 301 int ret; 302 303 uvm_assert_mmap_lock_locked_write(current->mm); 304 uvm_assert_rwsem_locked_write(&va_space->lock); 305 306 if (!uvm_va_space_mm_enabled(va_space)) 307 return NV_OK; 308 309 UVM_ASSERT(!va_space_mm->mm); 310 va_space_mm->mm = current->mm; 311 312 // We must be prepared to handle callbacks as soon as we make this call, 313 // except for ->release() which can't be called since the mm belongs to 314 // current. 315 ret = uvm_mmu_notifier_register(va_space_mm); 316 if (ret) { 317 // Inform uvm_va_space_mm_unregister() that it has nothing to do. 318 va_space_mm->mm = NULL; 319 return errno_to_nv_status(ret); 320 } 321 322 uvm_spin_lock(&va_space_mm->lock); 323 va_space_mm->alive = true; 324 uvm_spin_unlock(&va_space_mm->lock); 325 326 return NV_OK; 327 } 328 329 void uvm_va_space_mm_unregister(uvm_va_space_t *va_space) 330 { 331 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 332 333 // We can't hold the VA space lock or mmap_lock across this function since 334 // mmu_notifier_unregister() may trigger uvm_va_space_mm_shutdown(), which 335 // takes those locks and also waits for other threads which may take those 336 // locks. 337 uvm_assert_unlocked_order(UVM_LOCK_ORDER_MMAP_LOCK); 338 uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE); 339 340 if (!va_space_mm->mm) 341 return; 342 343 UVM_ASSERT(uvm_va_space_mm_enabled(va_space)); 344 uvm_mmu_notifier_unregister(va_space_mm); 345 346 // We're guaranteed that upon return from mmu_notifier_unregister(), 347 // uvm_va_space_mm_shutdown() will have been called (though perhaps not by 348 // this thread). Therefore all retainers have been flushed. 349 UVM_ASSERT(!va_space_mm->alive); 350 UVM_ASSERT(va_space_mm->retained_count == 0); 351 va_space_mm->mm = NULL; 352 } 353 354 struct mm_struct *uvm_va_space_mm_retain(uvm_va_space_t *va_space) 355 { 356 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 357 struct mm_struct *mm = NULL; 358 359 if (!uvm_va_space_mm_enabled(va_space)) 360 return NULL; 361 362 uvm_spin_lock(&va_space_mm->lock); 363 364 if (va_space_mm->alive) { 365 ++va_space_mm->retained_count; 366 mm = va_space_mm->mm; 367 UVM_ASSERT(mm); 368 } 369 370 uvm_spin_unlock(&va_space_mm->lock); 371 372 return mm; 373 } 374 375 struct mm_struct *uvm_va_space_mm_or_current_retain(uvm_va_space_t *va_space) 376 { 377 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 378 379 // We should only attempt to use current->mm from a user thread 380 UVM_ASSERT(!(current->flags & PF_KTHREAD)); 381 382 // current->mm is NULL when we're in process teardown. In that case it 383 // doesn't make sense to use any mm. 384 if (!current->mm) 385 return NULL; 386 387 // If the va_space_mm matches current->mm then it would be safe but sub- 388 // optimal to call mmget_not_zero(). current->mm is always valid to 389 // use when non-NULL so there is no need to retain it. 390 if (!uvm_va_space_mm_enabled(va_space) || va_space_mm->mm == current->mm) 391 return current->mm; 392 393 return mmget_not_zero(va_space_mm->mm) ? va_space_mm->mm : NULL; 394 } 395 396 void uvm_va_space_mm_release(uvm_va_space_t *va_space) 397 { 398 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 399 bool do_wake = false; 400 401 UVM_ASSERT(uvm_va_space_mm_enabled(va_space)); 402 403 // The mm must not have been torn down while we have it retained 404 UVM_ASSERT(va_space_mm->mm); 405 406 uvm_spin_lock(&va_space_mm->lock); 407 408 UVM_ASSERT(va_space_mm->retained_count > 0); 409 --va_space_mm->retained_count; 410 411 // If we're the last retainer on a dead mm, signal any potential waiters 412 if (va_space_mm->retained_count == 0 && !va_space_mm->alive) 413 do_wake = true; 414 415 uvm_spin_unlock(&va_space_mm->lock); 416 417 // There could be multiple threads in uvm_va_space_mm_shutdown() waiting on 418 // us, so we have to wake up all waiters. 419 if (do_wake) 420 wake_up_all(&va_space_mm->last_retainer_wait_queue); 421 } 422 423 void uvm_va_space_mm_or_current_release(uvm_va_space_t *va_space, struct mm_struct *mm) 424 { 425 // We can't hold the VA space lock or mmap_lock across this function since 426 // mmput() may trigger uvm_va_space_mm_shutdown(), which takes those locks 427 // and also waits for other threads which may take those locks. 428 uvm_assert_unlocked_order(UVM_LOCK_ORDER_MMAP_LOCK); 429 uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE); 430 431 if (mm && mm != current->mm) 432 uvm_mmput(mm); 433 } 434 435 static void uvm_va_space_mm_shutdown_delay(uvm_va_space_t *va_space) 436 { 437 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 438 NvU64 start_time; 439 int num_threads; 440 bool timed_out = false; 441 442 if (!va_space_mm->test.delay_shutdown) 443 return; 444 445 start_time = NV_GETTIME(); 446 447 num_threads = atomic_inc_return(&va_space_mm->test.num_mm_shutdown_threads); 448 UVM_ASSERT(num_threads > 0); 449 450 if (num_threads == 1) { 451 // Wait for another thread to arrive unless we time out 452 while (atomic_read(&va_space_mm->test.num_mm_shutdown_threads) == 1) { 453 if (NV_GETTIME() - start_time >= 1000*1000*UVM_VA_SPACE_MM_SHUTDOWN_DELAY_MAX_MS) { 454 timed_out = true; 455 break; 456 } 457 } 458 459 if (va_space_mm->test.verbose) 460 UVM_TEST_PRINT("Multiple threads: %d\n", !timed_out); 461 } 462 463 // No need to decrement num_mm_shutdown_threads since this va_space_mm is 464 // being shut down. 465 } 466 467 // Handles the va_space's mm being torn down while the VA space still exists. 468 // This function won't return until all in-flight retainers have called 469 // uvm_va_space_mm_release(). Subsequent calls to uvm_va_space_mm_retain() will 470 // return NULL. 471 // 472 // uvm_va_space_mm_unregister() must still be called. It is guaranteed that 473 // uvm_va_space_mm_shutdown() will not be called after 474 // uvm_va_space_mm_unregister() returns, though they may execute concurrently. 475 // If so, uvm_va_space_mm_unregister() will not return until 476 // uvm_va_space_mm_shutdown() is done. 477 // 478 // After this call returns the VA space is essentially dead. GPUs cannot make 479 // any new memory accesses in registered GPU VA spaces, and no more GPU faults 480 // which are attributed to this VA space will arrive. Additionally, no more 481 // registration within the VA space is allowed (GPU, GPU VA space, or channel). 482 // 483 // The requirements for this callback are that, once we return, the GPU and 484 // driver are completely done using the associated mm_struct. This includes: 485 // 486 // 1) GPUs will not issue any more memory accesses under this mm 487 // 2) [ATS only] GPUs will not issue any more ATRs under this mm 488 // 3) The driver will not ask the kernel to service faults on this mm 489 // 490 static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space) 491 { 492 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 493 uvm_gpu_va_space_t *gpu_va_space; 494 uvm_gpu_t *gpu; 495 uvm_global_processor_mask_t gpus_to_flush; 496 LIST_HEAD(deferred_free_list); 497 498 // The mm must not have been torn down completely yet, but it may have been 499 // marked as dead by a concurrent thread. 500 UVM_ASSERT(uvm_va_space_mm_enabled(va_space)); 501 UVM_ASSERT(va_space_mm->mm); 502 503 // Inject a delay for testing if requested 504 uvm_va_space_mm_shutdown_delay(va_space); 505 506 // There can be at most two threads here concurrently: 507 // 508 // 1) Thread A in process teardown of the original process 509 // 510 // 2) Thread B must be in the file close path of another process (either 511 // implicit or explicit), having already stopped all GPU accesses and 512 // having called uvm_va_space_mm_unregister. 513 // 514 // This corresponds to scenario #5 in the mm teardown block comment at the 515 // top of the file. We serialize between these threads with the VA space 516 // lock, but otherwise don't have any special handling: both threads will 517 // execute the full teardown sequence below. Also, remember that the threads 518 // won't return to their callers until both threads have returned from this 519 // function (following the rules for mmu_notifier_unregister). 520 521 uvm_va_space_down_write(va_space); 522 523 // Prevent future registrations of any kind. We'll be iterating over all 524 // GPUs and GPU VA spaces below but taking and dropping the VA space lock. 525 // It's ok for other threads to unregister those objects, but not to 526 // register new ones. 527 // 528 // We also need to prevent new channel work from arriving since we're trying 529 // to stop memory accesses. 530 va_space->disallow_new_registers = true; 531 532 uvm_va_space_downgrade_write_rm(va_space); 533 534 // Stop channels to prevent new accesses and new faults on non-MPS 535 uvm_va_space_stop_all_user_channels(va_space); 536 537 uvm_va_space_up_read_rm(va_space); 538 539 // Detach all channels to prevent pending untranslated faults from getting 540 // to this VA space. This also removes those channels from the VA space and 541 // puts them on the deferred free list, so only one thread will do this. 542 uvm_va_space_down_write(va_space); 543 uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list); 544 uvm_va_space_global_gpus_in_mask(va_space, &gpus_to_flush, &va_space->faultable_processors); 545 uvm_global_mask_retain(&gpus_to_flush); 546 uvm_va_space_up_write(va_space); 547 548 // Flush the fault buffer on all GPUs. This will avoid spurious cancels 549 // of stale pending translated faults after we clear va_space_mm->alive 550 // later. 551 for_each_global_gpu_in_mask(gpu, &gpus_to_flush) 552 uvm_gpu_fault_buffer_flush(gpu); 553 554 uvm_global_mask_release(&gpus_to_flush); 555 556 // Call nvUvmInterfaceUnsetPageDirectory. This has no effect on non-MPS. 557 // Under MPS this guarantees that no new GPU accesses will be made using 558 // this mm. 559 // 560 // We need only one thread to make this call, but two threads in here could 561 // race for it, or we could have one thread in here and one in 562 // destroy_gpu_va_space. Serialize these by starting in write mode then 563 // downgrading to read. 564 uvm_va_space_down_write(va_space); 565 uvm_va_space_downgrade_write_rm(va_space); 566 for_each_gpu_va_space(gpu_va_space, va_space) 567 uvm_gpu_va_space_unset_page_dir(gpu_va_space); 568 uvm_va_space_up_read_rm(va_space); 569 570 // The above call to uvm_gpu_va_space_unset_page_dir handles the GPU VA 571 // spaces which are known to be registered. However, we could've raced with 572 // a concurrent uvm_va_space_unregister_gpu_va_space, giving this sequence: 573 // 574 // unregister_gpu_va_space uvm_va_space_mm_shutdown 575 // uvm_va_space_down_write 576 // remove_gpu_va_space 577 // uvm_va_space_up_write 578 // uvm_va_space_down_write(va_space); 579 // // No GPU VA spaces 580 // Unlock, return 581 // uvm_deferred_free_object_list 582 // uvm_gpu_va_space_unset_page_dir 583 // 584 // We have to be sure that all accesses in this GPU VA space are done before 585 // returning, so we have to wait for the other thread to finish its 586 // uvm_gpu_va_space_unset_page_dir call. 587 // 588 // We can be sure that num_pending will eventually go to zero because we've 589 // prevented new GPU VA spaces from being registered above. 590 wait_event(va_space->gpu_va_space_deferred_free.wait_queue, 591 atomic_read(&va_space->gpu_va_space_deferred_free.num_pending) == 0); 592 593 // Now that there won't be any new GPU faults, prevent subsequent retainers 594 // from accessing this mm. 595 uvm_spin_lock(&va_space_mm->lock); 596 va_space_mm->alive = false; 597 uvm_spin_unlock(&va_space_mm->lock); 598 599 // Finish channel destroy. This can be done at any point after detach as 600 // long as we don't hold the VA space lock. 601 uvm_deferred_free_object_list(&deferred_free_list); 602 603 // Flush out all pending retainers 604 wait_event(va_space_mm->last_retainer_wait_queue, va_space_mm->retained_count == 0); 605 } 606 607 static NV_STATUS mm_read64(struct mm_struct *mm, NvU64 addr, NvU64 *val) 608 { 609 long ret; 610 struct page *page; 611 NvU64 *mapping; 612 613 UVM_ASSERT(IS_ALIGNED(addr, sizeof(*val))); 614 615 uvm_down_read_mmap_lock(mm); 616 ret = NV_PIN_USER_PAGES_REMOTE(mm, (unsigned long)addr, 1, 0, &page, NULL, NULL); 617 uvm_up_read_mmap_lock(mm); 618 619 if (ret < 0) 620 return errno_to_nv_status(ret); 621 622 UVM_ASSERT(ret == 1); 623 624 mapping = (NvU64 *)((char *)kmap(page) + (addr % PAGE_SIZE)); 625 *val = *mapping; 626 kunmap(page); 627 NV_UNPIN_USER_PAGE(page); 628 629 return NV_OK; 630 } 631 632 NV_STATUS uvm_test_va_space_mm_retain(UVM_TEST_VA_SPACE_MM_RETAIN_PARAMS *params, struct file *filp) 633 { 634 uvm_va_space_t *va_space = NULL; 635 struct mm_struct *mm = NULL; 636 NV_STATUS status = NV_OK; 637 638 if (!IS_ALIGNED(params->addr, sizeof(params->val_before))) 639 return NV_ERR_INVALID_ARGUMENT; 640 641 uvm_mutex_lock(&g_uvm_global.va_spaces.lock); 642 643 list_for_each_entry(va_space, &g_uvm_global.va_spaces.list, list_node) { 644 if ((uintptr_t)va_space == params->va_space_ptr) { 645 mm = uvm_va_space_mm_retain(va_space); 646 break; 647 } 648 } 649 650 uvm_mutex_unlock(&g_uvm_global.va_spaces.lock); 651 652 if ((uintptr_t)va_space != params->va_space_ptr) 653 return NV_ERR_MISSING_TABLE_ENTRY; 654 655 if (!mm) 656 return NV_ERR_PAGE_TABLE_NOT_AVAIL; 657 658 status = mm_read64(mm, params->addr, ¶ms->val_before); 659 660 if (status == NV_OK && params->sleep_us) { 661 usleep_range(params->sleep_us, params->sleep_us + 1000); 662 status = mm_read64(mm, params->addr, ¶ms->val_after); 663 } 664 665 uvm_va_space_mm_release(va_space); 666 return status; 667 } 668 669 NV_STATUS uvm_test_va_space_mm_delay_shutdown(UVM_TEST_VA_SPACE_MM_DELAY_SHUTDOWN_PARAMS *params, struct file *filp) 670 { 671 uvm_va_space_t *va_space = uvm_va_space_get(filp); 672 uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm; 673 NV_STATUS status = NV_ERR_PAGE_TABLE_NOT_AVAIL; 674 675 uvm_va_space_down_write(va_space); 676 677 if (uvm_va_space_mm_retain(va_space)) { 678 va_space_mm->test.delay_shutdown = true; 679 va_space_mm->test.verbose = params->verbose; 680 uvm_va_space_mm_release(va_space); 681 status = NV_OK; 682 } 683 684 uvm_va_space_up_write(va_space); 685 686 return status; 687 } 688 689 NV_STATUS uvm_test_va_space_mm_or_current_retain(UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN_PARAMS *params, 690 struct file *filp) 691 { 692 uvm_va_space_t *va_space = uvm_va_space_get(filp); 693 struct mm_struct *mm; 694 NV_STATUS status = NV_OK; 695 696 mm = uvm_va_space_mm_or_current_retain(va_space); 697 if (!mm) 698 return NV_ERR_PAGE_TABLE_NOT_AVAIL; 699 700 if (params->retain_done_ptr) { 701 NvU64 flag = true; 702 703 if (nv_copy_to_user((void __user *)params->retain_done_ptr, &flag, sizeof(flag))) 704 status = NV_ERR_INVALID_ARGUMENT; 705 } 706 707 if (status == NV_OK) { 708 if (params->sleep_us) 709 usleep_range(params->sleep_us, params->sleep_us + 1000); 710 711 params->mm_users = atomic_read(&mm->mm_users); 712 } 713 714 uvm_va_space_mm_or_current_release(va_space, mm); 715 716 return status; 717 } 718