/*******************************************************************************
    Copyright (c) 2018-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_kvmalloc.h"
#include "uvm_va_space.h"
#include "uvm_va_space_mm.h"
#include "uvm_ats.h"
#include "uvm_api.h"
#include "uvm_test.h"
#include "uvm_test_ioctl.h"

#if defined(NV_LINUX_SCHED_MM_H_PRESENT)
#include <linux/sched/mm.h>
#elif defined(NV_LINUX_SCHED_H_PRESENT)
#include <linux/sched.h>
#endif

//
// This comment block describes some implementation rationale. See the header
// for the API descriptions.
//
// ========================= Retain count vs mm_users ==========================
//
// We use two methods to guarantee the mm is available and won't be destroyed.
//
// On call paths where mmput() can safely be called (in general, the ioctl
// paths), we call uvm_va_space_mm_or_current_retain(), which uses
// mmget_not_zero(). Holding mm_users above 0 prevents mm teardown and avoids
// races with uvm_va_space_mm_shutdown(): the mmput() -> __mmput() ->
// exit_mmap() -> mmu_notifier_release() -> uvm_va_space_mm_shutdown() chain
// can't start until uvm_va_space_mm_or_current_release(), and we guarantee
// that the uvm_va_space_mm_unregister() -> mmu_notifier_unregister() ->
// uvm_va_space_mm_shutdown() path can't run while someone is about to call
// uvm_va_space_mm_or_current_retain(). Holding mm_users > 0 is also required
// by kernel calls such as mmu_interval_notifier_insert().
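//
// As a rough sketch of that pattern (uvm_api_do_work() stands in for the
// ioctl's actual work; error handling omitted):
//
//     struct mm_struct *mm = uvm_va_space_mm_or_current_retain(va_space);
//
//     // mm may be NULL, for example during process teardown
//     uvm_api_do_work(va_space, mm);
//     uvm_va_space_mm_or_current_release(va_space, mm);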
//
// On the replayable GPU fault handling path, we need the mm to be able to
// service faults in the window when mm_users == 0 but mmu_notifier_release()
// hasn't yet been called. We can't call mmput() because it may result in
// exit_mmap(), which could result in RM calls and VA space destroy. Those need
// to wait for the GPU fault handler to finish, so on that path we use an
// internal retained reference count and wait queue. When the mm is disabled
// via mmu_notifier_release(), we use the wait queue to wait for the reference
// count to go to 0.
// We also use this path for older Linux kernels where mm_users > 0 isn't
// required.
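//
// A minimal sketch of that internal-refcount pattern (service_faults_in_mm()
// is a placeholder for the fault servicing work):
//
//     struct mm_struct *mm = uvm_va_space_mm_retain(va_space);
//
//     if (mm) {
//         // uvm_va_space_mm_shutdown() waits for all retainers, so the mm
//         // stays usable until the matching release below
//         service_faults_in_mm(va_space, mm);
//         uvm_va_space_mm_release(va_space);
//     }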
//
// ============================ Handling mm teardown ===========================
//
// mmu_notifiers call the mm release callback both when the mm is really getting
// shut down, and whenever mmu_notifier_unregister is called. This has several
// consequences, including that these two paths can race. If they do race, they
// wait for each other to finish (real teardown of the mm won't start until the
// mmu_notifier_unregister's callback has returned, and mmu_notifier_unregister
// won't return until the mm release callback has returned).
//
// When the mm is really getting torn down, uvm_va_space_mm_shutdown() is
// expected to stop all GPU memory accesses to that mm and stop servicing
// faults in that mm. This essentially shuts down the VA space for new work.
// The VA space object remains valid for most teardown ioctls until the file
// is closed, because it's legal for the associated process to die and then
// for another process with a reference on the file to perform the unregisters
// or associated ioctls. This is particularly true for tools users.
//
// An exception to the above is UvmUnregisterChannel. Since channels are
// completely removed from the VA space on mm teardown, later channel
// unregisters will fail to find the handles and will return an error.
//
// The UVM driver will only call mmu_notifier_unregister during VA space destroy
// (file close).
//
// Here is a table of the various teardown scenarios:
//
//                                                Can race with
// Scenario                                       mm teardown
// -----------------------------------------------------------------------------
// 1) Process exit (mm teardown, file open)            -
// 2) Explicit file close in original mm               No
// 3) Explicit file close in different mm              Yes
// 4) Implicit file close (exit) in original mm        No
// 5) Implicit file close (exit) in different mm       Yes
//
// At a high level, the sequence of operations to perform during mm teardown is:
//
// 1) Stop all channels
//      - Prevents new faults and accesses on non-MPS
// 2) Detach all channels
//      - Prevents pending faults from being translated to this VA space
//      - Non-replayable faults will be dropped so no new ones can arrive
//      - Access counter notifications will be prevented from getting new
//        translations to this VA space. Pending entries may attempt to retain
//        the mm, but will drop the notification if they can't be serviced.
// 3) Flush the fault buffer
//      - The only reason to flush the fault buffer is to avoid spurious
//        cancels. If we didn't flush the fault buffer before marking the mm
//        as dead, then remaining faults which require the mm would be
//        cancelled. Since the faults might be stale, we would record cancel
//        events which didn't really happen (the access didn't happen after
//        the mm died). By flushing we clear out all stale faults, and in
//        the case of MPS, cancel real faults after.
// 4) UnsetPageDir
//      - Prevents new accesses on MPS
// 5) Mark the va_space_mm as dead
//      - Prevents new retainers from using the mm. There won't be any more on
//        the fault handling paths, but there could be others in worker threads.
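//
// This sequence is implemented by uvm_va_space_mm_shutdown() below.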
//
// Here are some tables of each step in the sequence, and what operations can
// still be performed after each step. This is all from the perspective of a
// single VA space. "Untranslated" means that the fault entry has not been
// translated to a uvm_va_space yet.
//
// Replayable non-MPS Behavior:
//
//                  Can              Pending         Pending         Can be
//                  access   Can     untranslated    translated      servicing
//                  memory   fault   faults          faults          faults
// -----------------------------------------------------------------------------
// Shutdown start   Yes      Yes     Service         Service         Yes
// Stop channels    No       No      Service [1]     Service [1]     Yes [1]
// Detach channels  No       No      Flush buffer    Service [1]     Yes [1], [2]
// Flush buffer     No       No      None possible   None possible   No
// UnsetPageDir     No       No      None possible   None possible   No
//
//
// Replayable MPS Behavior:
//
//                  Can              Pending         Pending         Can be
//                  access   Can     untranslated    translated      servicing
//                  memory   fault   faults          faults          faults
// -----------------------------------------------------------------------------
// Shutdown start   Yes      Yes     Service         Service         Yes
// Stop channels    Yes      Yes     Service         Service         Yes
// Detach channels  Yes      Yes     Cancel, flush   Service         Yes
// Flush buffer     Yes      Yes     Cancel, flush   None possible   No
// UnsetPageDir     No [3]   Yes     Cancel, flush   None possible   No
//
//
// [1]: All pending faults in this VA space are stale since channel stop
//      preempted the context.
// [2]: Faults in this VA space can't be serviced concurrently with detach since
//      detach holds the VA space lock in write mode. Faults in other VA spaces
//      can be serviced, and stale faults in this VA space can resume service
//      after detach is done.
// [3]: Due to the nature of MPS, remaining work which had started under the VA
//      space could still execute and attempt to make memory accesses. However,
//      since the PDB at that point is empty and ATS is disabled (if available),
//      all accesses will fault and be cancelled rather than successfully
//      translate to physical memory.
//
// =============================================================================

#define UVM_VA_SPACE_MM_SHUTDOWN_DELAY_MAX_MS 100

static int uvm_enable_va_space_mm = 1;
module_param(uvm_enable_va_space_mm, int, S_IRUGO);
MODULE_PARM_DESC(uvm_enable_va_space_mm,
                 "Set to 0 to disable UVM from using mmu_notifiers to create "
                 "an association between a UVM VA space and a process. This "
                 "will also disable pageable memory access via either ATS or "
                 "HMM.");

bool uvm_va_space_mm_enabled_system(void)
{
    return UVM_CAN_USE_MMU_NOTIFIERS() && uvm_enable_va_space_mm;
}

bool uvm_va_space_mm_enabled(uvm_va_space_t *va_space)
{
    // A va_space doesn't have any association with an mm in multi-process
    // sharing mode.
    if (va_space->initialization_flags & UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE)
        return false;

    return uvm_va_space_mm_enabled_system();
}

static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space);

#if !defined(NV_MMGET_NOT_ZERO_PRESENT)
static bool mmget_not_zero(struct mm_struct *mm)
{
    return atomic_inc_not_zero(&mm->mm_users);
}
#endif

#if UVM_CAN_USE_MMU_NOTIFIERS()

    static void uvm_mmput(struct mm_struct *mm)
    {
        mmput(mm);
    }

    static uvm_va_space_t *get_va_space(struct mmu_notifier *mn)
    {
        // This may be called without a thread context present, so be careful
        // what is used here.
        return container_of(mn, uvm_va_space_t, va_space_mm.mmu_notifier);
    }

    static void uvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
    {
        UVM_ENTRY_VOID(uvm_va_space_mm_shutdown(get_va_space(mn)));
    }

    static void uvm_mmu_notifier_invalidate_range_ats(struct mmu_notifier *mn,
                                                      struct mm_struct *mm,
                                                      unsigned long start,
                                                      unsigned long end)
    {
        // In most cases ->invalidate_range() is called with an exclusive end.
        // uvm_ats_invalidate() expects an inclusive end, so we have to
        // convert it.
        //
        // There's a special case however. Kernel TLB gathering sometimes
        // identifies "fullmm" invalidates by setting both start and end to ~0.
        //
        // It's unclear if there are any other cases in which the kernel will
        // call us with start == end. Since we can't definitively say no, we
        // conservatively treat all such calls as full invalidates.
        if (start == end) {
            start = 0;
            end = ~0UL;
        }
        else {
            --end;
        }

        UVM_ENTRY_VOID(uvm_ats_invalidate(get_va_space(mn), start, end));
    }

    static struct mmu_notifier_ops uvm_mmu_notifier_ops_release =
    {
        .release = uvm_mmu_notifier_release,
    };

    static struct mmu_notifier_ops uvm_mmu_notifier_ops_ats =
    {
        .release          = uvm_mmu_notifier_release,
        .invalidate_range = uvm_mmu_notifier_invalidate_range_ats,
    };

    static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
    {
        UVM_ASSERT(va_space_mm->mm);
        uvm_assert_mmap_lock_locked_write(va_space_mm->mm);

        if (UVM_ATS_IBM_SUPPORTED_IN_DRIVER() && g_uvm_global.ats.enabled)
            va_space_mm->mmu_notifier.ops = &uvm_mmu_notifier_ops_ats;
        else
            va_space_mm->mmu_notifier.ops = &uvm_mmu_notifier_ops_release;

        return __mmu_notifier_register(&va_space_mm->mmu_notifier, va_space_mm->mm);
    }

    static void uvm_mmu_notifier_unregister(uvm_va_space_mm_t *va_space_mm)
    {
        mmu_notifier_unregister(&va_space_mm->mmu_notifier, va_space_mm->mm);
    }
#else
    static void uvm_mmput(struct mm_struct *mm)
    {
        UVM_ASSERT(0);
    }

    static int uvm_mmu_notifier_register(uvm_va_space_mm_t *va_space_mm)
    {
        UVM_ASSERT(0);
        return 0;
    }

    static void uvm_mmu_notifier_unregister(uvm_va_space_mm_t *va_space_mm)
    {
        UVM_ASSERT(0);
    }
#endif // UVM_CAN_USE_MMU_NOTIFIERS()

NV_STATUS uvm_va_space_mm_register(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    int ret;

    uvm_assert_mmap_lock_locked_write(current->mm);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    if (!uvm_va_space_mm_enabled(va_space))
        return NV_OK;

    UVM_ASSERT(!va_space_mm->mm);
    va_space_mm->mm = current->mm;

    // We must be prepared to handle callbacks as soon as we make this call,
    // except for ->release() which can't be called since the mm belongs to
    // current.
    ret = uvm_mmu_notifier_register(va_space_mm);
    if (ret) {
        // Inform uvm_va_space_mm_unregister() that it has nothing to do.
        va_space_mm->mm = NULL;
        return errno_to_nv_status(ret);
    }

    uvm_spin_lock(&va_space_mm->lock);
    va_space_mm->alive = true;
    uvm_spin_unlock(&va_space_mm->lock);

    return NV_OK;
}

void uvm_va_space_mm_unregister(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;

    // We can't hold the VA space lock or mmap_lock across this function since
    // mmu_notifier_unregister() may trigger uvm_va_space_mm_shutdown(), which
    // takes those locks and also waits for other threads which may take those
    // locks.
    uvm_assert_unlocked_order(UVM_LOCK_ORDER_MMAP_LOCK);
    uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE);

    if (!va_space_mm->mm)
        return;

    UVM_ASSERT(uvm_va_space_mm_enabled(va_space));
    uvm_mmu_notifier_unregister(va_space_mm);

    // We're guaranteed that upon return from mmu_notifier_unregister(),
    // uvm_va_space_mm_shutdown() will have been called (though perhaps not by
    // this thread). Therefore all retainers have been flushed.
    UVM_ASSERT(!va_space_mm->alive);
    UVM_ASSERT(va_space_mm->retained_count == 0);
    va_space_mm->mm = NULL;
}

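// Retain the va_space's mm without incrementing mm_users. Returns NULL if the
// va_space_mm is disabled or has already been marked dead; otherwise the
// caller must drop the reference with uvm_va_space_mm_release().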
struct mm_struct *uvm_va_space_mm_retain(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    struct mm_struct *mm = NULL;

    if (!uvm_va_space_mm_enabled(va_space))
        return NULL;

    uvm_spin_lock(&va_space_mm->lock);

    if (va_space_mm->alive) {
        ++va_space_mm->retained_count;
        mm = va_space_mm->mm;
        UVM_ASSERT(mm);
    }

    uvm_spin_unlock(&va_space_mm->lock);

    return mm;
}

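// Returns the mm to use for the current call: current->mm when it matches the
// va_space's mm (or when va_space_mm is disabled), otherwise the va_space's mm
// retained with mmget_not_zero(). Returns NULL if no mm is available. Pair
// with uvm_va_space_mm_or_current_release().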
struct mm_struct *uvm_va_space_mm_or_current_retain(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;

    // We should only attempt to use current->mm from a user thread
    UVM_ASSERT(!(current->flags & PF_KTHREAD));

    // current->mm is NULL when we're in process teardown. In that case it
    // doesn't make sense to use any mm.
    if (!current->mm)
        return NULL;

    // If the va_space_mm matches current->mm then it would be safe but
    // sub-optimal to call mmget_not_zero(). current->mm is always valid to
    // use when non-NULL so there is no need to retain it.
    if (!uvm_va_space_mm_enabled(va_space) || va_space_mm->mm == current->mm)
        return current->mm;

    return mmget_not_zero(va_space_mm->mm) ? va_space_mm->mm : NULL;
}

void uvm_va_space_mm_release(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    bool do_wake = false;

    UVM_ASSERT(uvm_va_space_mm_enabled(va_space));

    // The mm must not have been torn down while we have it retained
    UVM_ASSERT(va_space_mm->mm);

    uvm_spin_lock(&va_space_mm->lock);

    UVM_ASSERT(va_space_mm->retained_count > 0);
    --va_space_mm->retained_count;

    // If we're the last retainer on a dead mm, signal any potential waiters
    if (va_space_mm->retained_count == 0 && !va_space_mm->alive)
        do_wake = true;

    uvm_spin_unlock(&va_space_mm->lock);

    // There could be multiple threads in uvm_va_space_mm_shutdown() waiting on
    // us, so we have to wake up all waiters.
    if (do_wake)
        wake_up_all(&va_space_mm->last_retainer_wait_queue);
}

void uvm_va_space_mm_or_current_release(uvm_va_space_t *va_space, struct mm_struct *mm)
{
    // We can't hold the VA space lock or mmap_lock across this function since
    // mmput() may trigger uvm_va_space_mm_shutdown(), which takes those locks
    // and also waits for other threads which may take those locks.
    uvm_assert_unlocked_order(UVM_LOCK_ORDER_MMAP_LOCK);
    uvm_assert_unlocked_order(UVM_LOCK_ORDER_VA_SPACE);

    if (mm && mm != current->mm)
        uvm_mmput(mm);
}

static void uvm_va_space_mm_shutdown_delay(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    NvU64 start_time;
    int num_threads;
    bool timed_out = false;

    if (!va_space_mm->test.delay_shutdown)
        return;

    start_time = NV_GETTIME();

    num_threads = atomic_inc_return(&va_space_mm->test.num_mm_shutdown_threads);
    UVM_ASSERT(num_threads > 0);

    if (num_threads == 1) {
        // Wait for another thread to arrive unless we time out
        while (atomic_read(&va_space_mm->test.num_mm_shutdown_threads) == 1) {
            if (NV_GETTIME() - start_time >= 1000*1000*UVM_VA_SPACE_MM_SHUTDOWN_DELAY_MAX_MS) {
                timed_out = true;
                break;
            }
        }

        if (va_space_mm->test.verbose)
            UVM_TEST_PRINT("Multiple threads: %d\n", !timed_out);
    }

    // No need to decrement num_mm_shutdown_threads since this va_space_mm is
    // being shut down.
}

// Handles the va_space's mm being torn down while the VA space still exists.
// This function won't return until all in-flight retainers have called
// uvm_va_space_mm_release(). Subsequent calls to uvm_va_space_mm_retain() will
// return NULL.
//
// uvm_va_space_mm_unregister() must still be called. It is guaranteed that
// uvm_va_space_mm_shutdown() will not be called after
// uvm_va_space_mm_unregister() returns, though they may execute concurrently.
// If so, uvm_va_space_mm_unregister() will not return until
// uvm_va_space_mm_shutdown() is done.
//
// After this call returns the VA space is essentially dead. GPUs cannot make
// any new memory accesses in registered GPU VA spaces, and no more GPU faults
// which are attributed to this VA space will arrive. Additionally, no more
// registration within the VA space is allowed (GPU, GPU VA space, or channel).
//
// The requirements for this callback are that, once we return, the GPU and
// driver are completely done using the associated mm_struct. This includes:
//
// 1) GPUs will not issue any more memory accesses under this mm
// 2) [ATS only] GPUs will not issue any more ATRs under this mm
// 3) The driver will not ask the kernel to service faults on this mm
//
static void uvm_va_space_mm_shutdown(uvm_va_space_t *va_space)
{
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_gpu_t *gpu;
    uvm_global_processor_mask_t gpus_to_flush;
    LIST_HEAD(deferred_free_list);

    // The mm must not have been torn down completely yet, but it may have been
    // marked as dead by a concurrent thread.
    UVM_ASSERT(uvm_va_space_mm_enabled(va_space));
    UVM_ASSERT(va_space_mm->mm);

    // Inject a delay for testing if requested
    uvm_va_space_mm_shutdown_delay(va_space);

    // There can be at most two threads here concurrently:
    //
    // 1) Thread A in process teardown of the original process
    //
    // 2) Thread B must be in the file close path of another process (either
    //    implicit or explicit), having already stopped all GPU accesses and
    //    having called uvm_va_space_mm_unregister.
    //
    // This corresponds to scenarios 3 and 5 in the mm teardown block comment at
    // the top of the file. We serialize between these threads with the VA space
    // lock, but otherwise don't have any special handling: both threads will
    // execute the full teardown sequence below. Also, remember that the threads
    // won't return to their callers until both threads have returned from this
    // function (following the rules for mmu_notifier_unregister).

    uvm_va_space_down_write(va_space);

    // Prevent future registrations of any kind. We'll be iterating over all
    // GPUs and GPU VA spaces below but taking and dropping the VA space lock.
    // It's ok for other threads to unregister those objects, but not to
    // register new ones.
    //
    // We also need to prevent new channel work from arriving since we're trying
    // to stop memory accesses.
    va_space->disallow_new_registers = true;

    uvm_va_space_downgrade_write_rm(va_space);

    // Stop channels to prevent new accesses and new faults on non-MPS
    uvm_va_space_stop_all_user_channels(va_space);

    uvm_va_space_up_read_rm(va_space);

    // Detach all channels to prevent pending untranslated faults from getting
    // to this VA space. This also removes those channels from the VA space and
    // puts them on the deferred free list, so only one thread will do this.
    uvm_va_space_down_write(va_space);
    uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list);
    uvm_va_space_global_gpus_in_mask(va_space, &gpus_to_flush, &va_space->faultable_processors);
    uvm_global_mask_retain(&gpus_to_flush);
    uvm_va_space_up_write(va_space);

    // Flush the fault buffer on all GPUs. This will avoid spurious cancels
    // of stale pending translated faults after we clear va_space_mm->alive
    // later.
    for_each_global_gpu_in_mask(gpu, &gpus_to_flush)
        uvm_gpu_fault_buffer_flush(gpu);

    uvm_global_mask_release(&gpus_to_flush);

    // Call nvUvmInterfaceUnsetPageDirectory. This has no effect on non-MPS.
    // Under MPS this guarantees that no new GPU accesses will be made using
    // this mm.
    //
    // We need only one thread to make this call, but two threads in here could
    // race for it, or we could have one thread in here and one in
    // destroy_gpu_va_space. Serialize these by starting in write mode then
    // downgrading to read.
    uvm_va_space_down_write(va_space);
    uvm_va_space_downgrade_write_rm(va_space);
    for_each_gpu_va_space(gpu_va_space, va_space)
        uvm_gpu_va_space_unset_page_dir(gpu_va_space);
    uvm_va_space_up_read_rm(va_space);

    // The above call to uvm_gpu_va_space_unset_page_dir handles the GPU VA
    // spaces which are known to be registered. However, we could've raced with
    // a concurrent uvm_va_space_unregister_gpu_va_space, giving this sequence:
    //
    // unregister_gpu_va_space                  uvm_va_space_mm_shutdown
    //     uvm_va_space_down_write
    //     remove_gpu_va_space
    //     uvm_va_space_up_write
    //                                          uvm_va_space_down_write(va_space);
    //                                          // No GPU VA spaces
    //                                          Unlock, return
    //     uvm_deferred_free_object_list
    //         uvm_gpu_va_space_unset_page_dir
    //
    // We have to be sure that all accesses in this GPU VA space are done before
    // returning, so we have to wait for the other thread to finish its
    // uvm_gpu_va_space_unset_page_dir call.
    //
    // We can be sure that num_pending will eventually go to zero because we've
    // prevented new GPU VA spaces from being registered above.
    wait_event(va_space->gpu_va_space_deferred_free.wait_queue,
               atomic_read(&va_space->gpu_va_space_deferred_free.num_pending) == 0);

    // Now that there won't be any new GPU faults, prevent subsequent retainers
    // from accessing this mm.
    uvm_spin_lock(&va_space_mm->lock);
    va_space_mm->alive = false;
    uvm_spin_unlock(&va_space_mm->lock);

    // Finish channel destroy. This can be done at any point after detach as
    // long as we don't hold the VA space lock.
    uvm_deferred_free_object_list(&deferred_free_list);

    // Flush out all pending retainers
    wait_event(va_space_mm->last_retainer_wait_queue, va_space_mm->retained_count == 0);
}

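// Test helper: reads a 64-bit value at the given user address in the target mm
// by pinning the backing page and temporarily mapping it into the kernel.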
static NV_STATUS mm_read64(struct mm_struct *mm, NvU64 addr, NvU64 *val)
{
    long ret;
    struct page *page;
    NvU64 *mapping;

    UVM_ASSERT(IS_ALIGNED(addr, sizeof(*val)));

    uvm_down_read_mmap_lock(mm);
    ret = NV_PIN_USER_PAGES_REMOTE(mm, (unsigned long)addr, 1, 0, &page, NULL, NULL);
    uvm_up_read_mmap_lock(mm);

    if (ret < 0)
        return errno_to_nv_status(ret);

    UVM_ASSERT(ret == 1);

    mapping = (NvU64 *)((char *)kmap(page) + (addr % PAGE_SIZE));
    *val = *mapping;
    kunmap(page);
    NV_UNPIN_USER_PAGE(page);

    return NV_OK;
}

NV_STATUS uvm_test_va_space_mm_retain(UVM_TEST_VA_SPACE_MM_RETAIN_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = NULL;
    struct mm_struct *mm = NULL;
    NV_STATUS status = NV_OK;

    if (!IS_ALIGNED(params->addr, sizeof(params->val_before)))
        return NV_ERR_INVALID_ARGUMENT;

    uvm_mutex_lock(&g_uvm_global.va_spaces.lock);

    list_for_each_entry(va_space, &g_uvm_global.va_spaces.list, list_node) {
        if ((uintptr_t)va_space == params->va_space_ptr) {
            mm = uvm_va_space_mm_retain(va_space);
            break;
        }
    }

    uvm_mutex_unlock(&g_uvm_global.va_spaces.lock);

    if ((uintptr_t)va_space != params->va_space_ptr)
        return NV_ERR_MISSING_TABLE_ENTRY;

    if (!mm)
        return NV_ERR_PAGE_TABLE_NOT_AVAIL;

    status = mm_read64(mm, params->addr, &params->val_before);

    if (status == NV_OK && params->sleep_us) {
        usleep_range(params->sleep_us, params->sleep_us + 1000);
        status = mm_read64(mm, params->addr, &params->val_after);
    }

    uvm_va_space_mm_release(va_space);
    return status;
}

NV_STATUS uvm_test_va_space_mm_delay_shutdown(UVM_TEST_VA_SPACE_MM_DELAY_SHUTDOWN_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_space_mm_t *va_space_mm = &va_space->va_space_mm;
    NV_STATUS status = NV_ERR_PAGE_TABLE_NOT_AVAIL;

    uvm_va_space_down_write(va_space);

    if (uvm_va_space_mm_retain(va_space)) {
        va_space_mm->test.delay_shutdown = true;
        va_space_mm->test.verbose = params->verbose;
        uvm_va_space_mm_release(va_space);
        status = NV_OK;
    }

    uvm_va_space_up_write(va_space);

    return status;
}

NV_STATUS uvm_test_va_space_mm_or_current_retain(UVM_TEST_VA_SPACE_MM_OR_CURRENT_RETAIN_PARAMS *params,
                                                 struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;

    mm = uvm_va_space_mm_or_current_retain(va_space);
    if (!mm)
        return NV_ERR_PAGE_TABLE_NOT_AVAIL;

    if (params->retain_done_ptr) {
        NvU64 flag = true;

        if (nv_copy_to_user((void __user *)params->retain_done_ptr, &flag, sizeof(flag)))
            status = NV_ERR_INVALID_ARGUMENT;
    }

    if (status == NV_OK) {
        if (params->sleep_us)
            usleep_range(params->sleep_us, params->sleep_us + 1000);

        params->mm_users = atomic_read(&mm->mm_users);
    }

    uvm_va_space_mm_or_current_release(va_space, mm);

    return status;
}