1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_common.h"
25 #include "uvm_ioctl.h"
26 #include "uvm_linux.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu.h"
29 #include "uvm_lock.h"
30 #include "uvm_va_space.h"
31 #include "uvm_va_range.h"
32 #include "uvm_va_block.h"
33 #include "uvm_tracker.h"
34 #include "uvm_api.h"
35 #include "uvm_channel.h"
36 #include "uvm_processors.h"
37 #include "uvm_push.h"
38 #include "uvm_hal.h"
39 #include "uvm_tools.h"
40 #include "uvm_migrate.h"
41 #include "uvm_migrate_pageable.h"
42 #include "uvm_va_space_mm.h"
43 #include "nv_speculation_barrier.h"
44 
45 typedef enum
46 {
47     UVM_MIGRATE_PASS_FIRST,
48     UVM_MIGRATE_PASS_SECOND
49 } uvm_migrate_pass_t;
50 
51 static int uvm_perf_migrate_cpu_preunmap_enable = 1;
52 module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO);
53 
54 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2
55 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX     10
56 static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
57 module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);
58 
59 // Global post-processed values of the module parameters
60 static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
61 static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;
62 
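// Return true if the [base, base + length) range fits entirely within a single
// VA block of first_va_range. base is expected to lie within first_va_range.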
63 static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
64 {
65     NvU64 end = base + length - 1;
66 
67     if (end > first_va_range->node.end)
68         return false;
69 
70     return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
71 }
72 
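// Map, on dest_id, the pages in the region that are not yet mapped on the
// destination, using for each page the highest protection that can be granted
// without requiring a revocation (see
// uvm_va_block_page_compute_highest_permission). Pages for which no protection
// can be granted are skipped.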
73 static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
74                                                 uvm_va_block_retry_t *va_block_retry,
75                                                 uvm_va_block_context_t *va_block_context,
76                                                 uvm_va_block_region_t region,
77                                                 uvm_processor_id_t dest_id)
78 {
79     uvm_prot_t prot;
80     uvm_page_index_t page_index;
81     NV_STATUS status = NV_OK;
82     const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id);
83 
84     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot)
85         va_block_context->mask_by_prot[prot - 1].count = 0;
86 
87     // Only map those pages that are not already mapped on destination
88     for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
89         prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
90         if (prot == UVM_PROT_NONE)
91             continue;
92 
93         if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
94             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);
95 
96         uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
97     }
98 
99     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
100         if (va_block_context->mask_by_prot[prot - 1].count == 0)
101             continue;
102 
103         // We pass UvmEventMapRemoteCauseInvalid since the destination processor
104         // of a migration will never be mapped remotely
105         status = uvm_va_block_map(va_block,
106                                   va_block_context,
107                                   dest_id,
108                                   region,
109                                   &va_block_context->mask_by_prot[prot - 1].page_mask,
110                                   prot,
111                                   UvmEventMapRemoteCauseInvalid,
112                                   &va_block->tracker);
113         if (status != NV_OK)
114             break;
115 
116         // Whoever added the other mapping(s) should have already added
117         // SetAccessedBy processors
118     }
119 
120     return status;
121 }
122 
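// Map, on dest_id and with RWA protection, the pages in the region that are
// not mapped on any processor, then add the mappings required by the
// accessed-by policy. The pushed work is added to the VA block tracker.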
123 static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
124                                                   uvm_va_block_retry_t *va_block_retry,
125                                                   uvm_va_block_context_t *va_block_context,
126                                                   uvm_va_block_region_t region,
127                                                   uvm_processor_id_t dest_id)
128 
129 {
130     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
131     NV_STATUS status = NV_OK;
132     NV_STATUS tracker_status;
133 
134     // Get the mask of unmapped pages because it will change after the
135     // first map operation
136     uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
137 
138     if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
139         // Do not map pages that are already resident on the CPU. This is in
140         // order to avoid breaking system-wide atomic operations on HMM. HMM's
        // implementation of system-wide atomic operations involves restricting
142         // mappings to one processor (CPU or a GPU) at a time. If we were to
143         // grant a GPU a mapping to system memory, this gets into trouble
144         // because, on the CPU side, Linux can silently upgrade PTE permissions
145         // (move from read-only, to read-write, without any MMU notifiers
146         // firing), thus breaking the model by allowing simultaneous read-write
147         // access from two separate processors. To avoid that, just don't map
148         // such pages at all, when migrating.
149         uvm_page_mask_andnot(&va_block_context->caller_page_mask,
150                              &va_block_context->caller_page_mask,
151                              uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
152     }
153 
154     // Only map those pages that are not mapped anywhere else (likely due
155     // to a first touch or a migration). We pass
156     // UvmEventMapRemoteCauseInvalid since the destination processor of a
157     // migration will never be mapped remotely.
158     status = uvm_va_block_map(va_block,
159                               va_block_context,
160                               dest_id,
161                               region,
162                               &va_block_context->caller_page_mask,
163                               UVM_PROT_READ_WRITE_ATOMIC,
164                               UvmEventMapRemoteCauseInvalid,
165                               &local_tracker);
166     if (status != NV_OK)
167         goto out;
168 
169     // Add mappings for AccessedBy processors
170     //
171     // No mappings within this call will operate on dest_id, so we don't
172     // need to acquire the map operation above.
173     status = uvm_va_block_add_mappings_after_migration(va_block,
174                                                        va_block_context,
175                                                        dest_id,
176                                                        dest_id,
177                                                        region,
178                                                        &va_block_context->caller_page_mask,
179                                                        UVM_PROT_READ_WRITE_ATOMIC,
180                                                        NULL);
181 
182 out:
183     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
184     uvm_tracker_deinit(&local_tracker);
185     return status == NV_OK ? tracker_status : status;
186 }
187 
// Pages that are not mapped anywhere can be safely mapped with RWA permission.
// For the remaining pages, the maximum permission that does not require a
// revocation must be computed individually.
191 static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block,
192                                             uvm_va_block_retry_t *va_block_retry,
193                                             uvm_va_block_context_t *va_block_context,
194                                             uvm_va_block_region_t region,
195                                             uvm_processor_id_t dest_id)
196 
197 {
198     NV_STATUS status;
199 
200     status = block_migrate_map_unmapped_pages(va_block,
201                                               va_block_retry,
202                                               va_block_context,
203                                               region,
204                                               dest_id);
205     if (status != NV_OK)
206         return status;
207 
208     return block_migrate_map_mapped_pages(va_block,
209                                           va_block_retry,
210                                           va_block_context,
211                                           region,
212                                           dest_id);
213 }
214 
215 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
216                                       uvm_va_block_retry_t *va_block_retry,
217                                       uvm_va_block_context_t *va_block_context,
218                                       uvm_va_block_region_t region,
219                                       uvm_processor_id_t dest_id,
220                                       uvm_migrate_mode_t mode,
221                                       uvm_tracker_t *out_tracker)
222 {
223     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
224     NV_STATUS status, tracker_status = NV_OK;
225 
226     uvm_assert_mutex_locked(&va_block->lock);
227     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context->hmm.vma, region));
228 
229     if (uvm_va_block_is_hmm(va_block)) {
230         status = uvm_hmm_va_block_migrate_locked(va_block,
231                                                  va_block_retry,
232                                                  va_block_context,
233                                                  dest_id,
234                                                  region,
235                                                  UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
236     }
237     else {
238         uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
239 
240         if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
241             status = uvm_va_block_make_resident_read_duplicate(va_block,
242                                                                va_block_retry,
243                                                                va_block_context,
244                                                                dest_id,
245                                                                region,
246                                                                NULL,
247                                                                NULL,
248                                                                UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
249         }
250         else {
251             status = uvm_va_block_make_resident(va_block,
252                                                 va_block_retry,
253                                                 va_block_context,
254                                                 dest_id,
255                                                 region,
256                                                 NULL,
257                                                 NULL,
258                                                 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
259         }
260     }
261 
262     if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
263         // block_migrate_add_mappings will acquire the work from the above
264         // make_resident call and update the VA block tracker.
265         status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id);
266     }
267 
268     if (out_tracker)
269         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
270 
271     return status == NV_OK ? tracker_status : status;
272 }
273 
274 // Unmapping CPU pages on P9 systems is very costly, to the point that it
275 // becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for
276 // migrations that need to remove CPU mappings compared to migrations that only
277 // create CPU mappings. The overhead can be fully attributed to the TLB
278 // shootdown. When a CPU page is unmapped, it needs to (1) invalidate any copy
279 // in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK
280 // to remove the corresponding entries in the GPUs' TLBs. ATSDs are not even
// required when migrating managed memory since UVM ensures that there are no
282 // ATS entries cached in the GPU TLBs for the managed VA ranges. However, we
283 // don't have a way to skip them as of today.
284 //
285 // In order to minimize the overhead of CPU unmaps during UvmMigrate we try to
286 // call unmap_mapping_range on VA regions larger than the VA block granularity
287 // before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued. This
289 // is because the NPU code uses MMU notifiers in order to get a callback
290 // (invalidate_range) when a TLB invalidation is required. Fortunately, this
291 // callback is not called if there is nothing to be invalidated. Therefore, if
292 // we issue a large unmap, subsequent unmaps within that region will not invoke
293 // the callback.
294 //
295 // However, due to (1), even issuing a single invalidate for the whole migrated
296 // range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
297 // This is only expected to get worse if CPU-GPU interconnects' BW keeps
298 // increasing.
299 //
300 // Thus, VA range migrations are split into groups of contiguous VA blocks, and
301 // trigger a single pre-unmap of the group of VA blocks in the Linux kernel
302 // before the VA blocks' migration starts. This way, we trigger larger (more
// efficient) TLB invalidations than when we do it one VA block at a time, while
// still being able to pipeline the migration, which allows us to hide most of the
305 // costs of (1).
306 //
// However, there are some cases in which the CPU has mappings to the pages
// being migrated but they don't need to be removed (removing them would just
// introduce unnecessary CPU faults later on). Therefore, we skip the pre-unmap
// step under the following conditions:
311 // - Pages mapped by the CPU that are *already* in the destination.
// - Pages mapped by the CPU that are *not* in the destination but for which
// read-duplication is enabled in the VA range.
314 
315 // This function checks if the pre-unmap optimization is required given the
316 // system capabilities and the destination of the migration. This is to skip
317 // any subsequent checks required by the optimization, which can be costly.
318 //
319 // The current logic checks that:
320 // - We are in the first pass of the migration (see the explanation of the
321 // two-pass strategy in uvm_migrate).
322 // - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't
323 // need this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks. Otherwise, we skip the preunmap to
// avoid its overhead.
326 static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
327                                              uvm_migrate_pass_t pass,
328                                              bool is_single_block)
329 
330 {
331     if (!g_uvm_perf_migrate_cpu_preunmap_enable)
332         return false;
333 
334     if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
335         return false;
336 
337     if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
338         return false;
339 
340     return true;
341 }
342 
// This function determines whether the VA range's properties make it
// unnecessary to remove CPU mappings during UvmMigrate. Currently, it only
// checks whether read-duplication is enabled in the VA range. This is because,
// when migrating read-duplicated VA blocks, the source processor doesn't need
// to be unmapped (though it may need write access revoked).
348 static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
349                                             uvm_va_space_t *va_space)
350 {
351     return !uvm_va_policy_is_read_duplicate(policy, va_space);
352 }
353 
// Determines whether the VA block to be migrated contains pages with CPU
// mappings that don't need to be removed (see the comment above). In that case
// false is returned. Otherwise it returns true and stores, in the variable
// pointed to by num_unmap_pages, the number of pages whose CPU mappings do
// need to be removed.
359 static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
360                                             uvm_va_block_context_t *va_block_context,
361                                             NvU64 start,
362                                             NvU64 end,
363                                             uvm_processor_id_t dest_id,
364                                             NvU32 *num_unmap_pages)
365 {
366     const uvm_page_mask_t *mapped_pages_cpu;
367     NvU32 num_cpu_unchanged_pages = 0;
368     uvm_va_block_region_t region;
369 
370     *num_unmap_pages = 0;
371 
372     if (!va_block)
373         return true;
374 
375     region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));
376 
377     uvm_mutex_lock(&va_block->lock);
378 
379     mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
380     if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
381         const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
382         uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
383 
384         // TODO: Bug 1877578
385         //
386         // We assume that if pages are mapped on the CPU and not resident on
387         // the destination, the pages will change residency so the CPU must be
388         // unmapped. If we implement automatic read-duplication heuristics in
389         // the future, we'll also need to check if the pages are being
390         // read-duplicated.
391         uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest);
392 
393         num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region);
394     }
395 
396     *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages;
397 
398     uvm_mutex_unlock(&va_block->lock);
399 
400     return num_cpu_unchanged_pages == 0;
401 }
402 
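// Try to remove the CPU mappings for [start, end] with a single
// unmap_mapping_range() call. The unmap is skipped if any VA block in the
// range contains CPU mappings that should be kept (see
// va_block_should_do_cpu_preunmap) or if there are no CPU mappings to remove.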
403 static void preunmap_multi_block(uvm_va_range_t *va_range,
404                                  uvm_va_block_context_t *va_block_context,
405                                  NvU64 start,
406                                  NvU64 end,
407                                  uvm_processor_id_t dest_id)
408 {
409     size_t i;
410     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
411     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
412     NvU32 num_unmap_pages = 0;
413 
414     UVM_ASSERT(start >= va_range->node.start);
415     UVM_ASSERT(end  <= va_range->node.end);
416     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
417     uvm_assert_rwsem_locked(&va_range->va_space->lock);
418 
419     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
420 
421     for (i = first_block_index; i <= last_block_index; i++) {
422         NvU32 num_block_unmap_pages;
423 
424         if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
425                                              va_block_context,
426                                              start,
427                                              end,
428                                              dest_id,
429                                              &num_block_unmap_pages)) {
430             return;
431         }
432 
433         num_unmap_pages += num_block_unmap_pages;
434     }
435 
436     if (num_unmap_pages > 0)
437         unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
438 }
439 
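// Migrate the [start, end] portion of va_range to dest_id one VA block at a
// time, creating any missing VA blocks along the way.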
440 static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
441                                                   uvm_va_block_context_t *va_block_context,
442                                                   NvU64 start,
443                                                   NvU64 end,
444                                                   uvm_processor_id_t dest_id,
445                                                   uvm_migrate_mode_t mode,
446                                                   uvm_tracker_t *out_tracker)
447 {
448     size_t i;
449     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
450     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
451 
452     UVM_ASSERT(start >= va_range->node.start);
453     UVM_ASSERT(end  <= va_range->node.end);
454     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
455     uvm_assert_rwsem_locked(&va_range->va_space->lock);
456 
457     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
458 
459     // Iterate over blocks, populating them if necessary
460     for (i = first_block_index; i <= last_block_index; i++) {
461         uvm_va_block_retry_t va_block_retry;
462         uvm_va_block_region_t region;
463         uvm_va_block_t *va_block;
464         NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);
465 
466         if (status != NV_OK)
467             return status;
468 
469         region = uvm_va_block_region_from_start_end(va_block,
470                                                     max(start, va_block->start),
471                                                     min(end, va_block->end));
472 
473         status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
474                                          uvm_va_block_migrate_locked(va_block,
475                                                                      &va_block_retry,
476                                                                      va_block_context,
477                                                                      region,
478                                                                      dest_id,
479                                                                      mode,
480                                                                      out_tracker));
481         if (status != NV_OK)
482             return status;
483     }
484 
485     return NV_OK;
486 }
487 
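// Migrate the [start, end] portion of va_range to dest_id. When CPU
// pre-unmapping is enabled (and not disabled by the range's policy), the range
// is processed in chunks of g_uvm_perf_migrate_cpu_preunmap_size bytes and the
// CPU mappings of each chunk are removed before migrating it.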
488 static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
489                                       uvm_va_block_context_t *va_block_context,
490                                       NvU64 start,
491                                       NvU64 end,
492                                       uvm_processor_id_t dest_id,
493                                       uvm_migrate_mode_t mode,
494                                       bool should_do_cpu_preunmap,
495                                       uvm_tracker_t *out_tracker)
496 {
497     NvU64 preunmap_range_start = start;
498     uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
499 
500     should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(policy, va_range->va_space);
501 
    // Divide the migration into groups of contiguous VA blocks so that CPU
    // unmaps can be triggered for each group before its migration starts.
504     while (preunmap_range_start < end) {
505         NV_STATUS status;
506         NvU64 preunmap_range_end;
507 
508         if (should_do_cpu_preunmap) {
509             preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
510             preunmap_range_end = min(preunmap_range_end - 1, end);
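            // Illustrative example: with an 8MB pre-unmap size and
            // preunmap_range_start at 0x400000, UVM_ALIGN_UP(0x400001, 8MB) is
            // 0x800000, so the chunk ends at min(0x7FFFFF, end), i.e. at the
            // next 8MB boundary minus one byte.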
511 
512             preunmap_multi_block(va_range,
513                                  va_block_context,
514                                  preunmap_range_start,
515                                  preunmap_range_end,
516                                  dest_id);
517         }
518         else {
519             preunmap_range_end = end;
520         }
521 
522         status = uvm_va_range_migrate_multi_block(va_range,
523                                                   va_block_context,
524                                                   preunmap_range_start,
525                                                   preunmap_range_end,
526                                                   dest_id,
527                                                   mode,
528                                                   out_tracker);
529         if (status != NV_OK)
530             return status;
531 
532         preunmap_range_start = preunmap_range_end + 1;
533     }
534 
535     return NV_OK;
536 }
537 
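// Migrate all migratable pages within [base, base + length) to dest_id,
// iterating over VA ranges (or, for HMM, over VA blocks). Returns
// NV_ERR_INVALID_ADDRESS if the region contains gaps or non-managed ranges,
// and NV_WARN_MORE_PROCESSING_REQUIRED if pages were skipped because they
// belong to a non-migratable range group and are not already resident at the
// destination.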
538 static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
539                                     uvm_va_block_context_t *va_block_context,
540                                     uvm_va_range_t *first_va_range,
541                                     NvU64 base,
542                                     NvU64 length,
543                                     uvm_processor_id_t dest_id,
544                                     uvm_migrate_mode_t mode,
545                                     bool should_do_cpu_preunmap,
546                                     uvm_tracker_t *out_tracker)
547 {
548     uvm_va_range_t *va_range, *va_range_last;
549     NvU64 end = base + length - 1;
550     NV_STATUS status = NV_OK;
551     bool skipped_migrate = false;
552 
553     if (!first_va_range) {
554         // For HMM, we iterate over va_blocks since there is no va_range.
555         return uvm_hmm_migrate_ranges(va_space,
556                                       va_block_context,
557                                       base,
558                                       length,
559                                       dest_id,
560                                       mode,
561                                       out_tracker);
562     }
563 
564     UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));
565 
566     va_range_last = NULL;
567     uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
568         uvm_range_group_range_iter_t iter;
569         uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
570 
571         va_range_last = va_range;
572 
573         // Only managed ranges can be migrated
574         if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
575             status = NV_ERR_INVALID_ADDRESS;
576             break;
577         }
578 
        // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
        // into many range groups. For this reason, we iterate over each
        // va_range first, then over the range groups within it.
582         uvm_range_group_for_each_migratability_in(&iter,
583                                                   va_space,
584                                                   max(base, va_range->node.start),
585                                                   min(end, va_range->node.end)) {
586             // Skip non-migratable VA ranges
587             if (!iter.migratable) {
588                 // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't
589                 // already resident at dest_id.
590                 if (!uvm_va_policy_preferred_location_equal(policy, dest_id, va_block_context->make_resident.dest_nid))
591                     skipped_migrate = true;
592             }
593             else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
594                      !uvm_id_equal(dest_id, policy->preferred_location)) {
595                 // Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
596                 // unless it's the preferred location
597                 status = NV_ERR_INVALID_DEVICE;
598                 break;
599             }
600             else {
601                 status = uvm_va_range_migrate(va_range,
602                                               va_block_context,
603                                               iter.start,
604                                               iter.end,
605                                               dest_id,
606                                               mode,
607                                               should_do_cpu_preunmap,
608                                               out_tracker);
609                 if (status != NV_OK)
610                     break;
611             }
612         }
613     }
614 
615     if (status != NV_OK)
616         return status;
617 
618     // Check that we were able to iterate over the entire range without any gaps
619     if (!va_range_last || va_range_last->node.end < end)
620         return NV_ERR_INVALID_ADDRESS;
621 
622     if (skipped_migrate)
623         return NV_WARN_MORE_PROCESSING_REQUIRED;
624 
625     return NV_OK;
626 }
627 
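// Common implementation behind UVM_MIGRATE and UVM_MIGRATE_RANGE_GROUP. The
// migration is skipped (returning NV_OK) if the destination processor has no
// memory to migrate to.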
628 static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
629                              struct mm_struct *mm,
630                              NvU64 base,
631                              NvU64 length,
632                              uvm_processor_id_t dest_id,
633                              int dest_nid,
634                              NvU32 migrate_flags,
635                              uvm_va_range_t *first_va_range,
636                              uvm_tracker_t *out_tracker)
637 {
638     NV_STATUS status = NV_OK;
639     uvm_va_block_context_t *va_block_context;
640     bool do_mappings;
641     bool do_two_passes;
642     bool is_single_block;
643     bool should_do_cpu_preunmap;
644 
645     uvm_assert_rwsem_locked(&va_space->lock);
646 
647     // If the GPU has its memory disabled, just skip the migration and let
648     // faults take care of things.
649     if (!uvm_va_space_processor_has_memory(va_space, dest_id))
650         return NV_OK;
651 
652     if (mm)
653         uvm_assert_mmap_lock_locked(mm);
654     else if (!first_va_range)
655         return NV_ERR_INVALID_ADDRESS;
656 
657     va_block_context = uvm_va_block_context_alloc(mm);
658     if (!va_block_context)
659         return NV_ERR_NO_MEMORY;
660 
661     va_block_context->make_resident.dest_nid = dest_nid;
662 
663     // We perform two passes (unless the migration only covers a single VA
664     // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the
665     // following scenarios:
666     //
667     // - Migrations that add CPU mappings, since they are synchronous operations
668     // that delay the migration of the next VA blocks.
669     // - Concurrent migrations. This is due to our current channel selection
670     // logic that doesn't prevent false dependencies between independent
    // operations. For example, removal of mappings for outgoing transfers is
    // delayed by the mappings added by incoming transfers.
673     // TODO: Bug 1764953: Re-evaluate the two-pass logic when channel selection
674     // is overhauled.
675     //
676     // The two passes are as follows:
677     //
678     // 1- Transfer all VA blocks (do not add mappings)
679     // 2- Go block by block reexecuting the transfer (in case someone moved it
680     // since the first pass), and adding the mappings.
681     //
682     // For HMM (!first_va_range), we always do a single pass.
683     is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
684     do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
685     do_two_passes = do_mappings && !is_single_block;
686 
687     if (do_two_passes) {
688         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block);
689 
690         status = uvm_migrate_ranges(va_space,
691                                     va_block_context,
692                                     first_va_range,
693                                     base,
694                                     length,
695                                     dest_id,
696                                     UVM_MIGRATE_MODE_MAKE_RESIDENT,
697                                     should_do_cpu_preunmap,
698                                     out_tracker);
699     }
700 
701     if (status == NV_OK) {
702         uvm_migrate_mode_t mode = do_mappings? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP:
703                                                UVM_MIGRATE_MODE_MAKE_RESIDENT;
704         uvm_migrate_pass_t pass = do_two_passes? UVM_MIGRATE_PASS_SECOND:
705                                                  UVM_MIGRATE_PASS_FIRST;
706         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block);
707 
708         status = uvm_migrate_ranges(va_space,
709                                     va_block_context,
710                                     first_va_range,
711                                     base,
712                                     length,
713                                     dest_id,
714                                     mode,
715                                     should_do_cpu_preunmap,
716                                     out_tracker);
717     }
718 
719     uvm_va_block_context_free(va_block_context);
720 
721     return status;
722 }
723 
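// Push a semaphore release on the given GPU. The push acquires
// release_after_tracker and is then added to the semaphore pool range's
// tracker.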
724 static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu,
725                                             uvm_va_range_semaphore_pool_t *semaphore_va_range,
726                                             NvU64 semaphore_user_addr,
727                                             NvU32 semaphore_payload,
728                                             uvm_tracker_t *release_after_tracker)
729 {
730     NV_STATUS status;
731     uvm_push_t push;
732     uvm_channel_type_t channel_type;
733     NvU64 semaphore_gpu_va;
734     NvU64 semaphore_offset;
735 
736     UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu));
737 
738     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr;
739     semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset;
740 
741     // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a
742     // performance standpoint because if the migration is targeting a GPU, it is
743     // likely that the channel used for the GPU page table update (pushed to
    // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. Using a
    // single channel avoids an inter-channel dependency that would otherwise
    // add significant overhead to the enclosing migration.
747     //
748     // In SR-IOV heavy, the user semaphore release is functionally forbidden
749     // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is not
750     // a page tree operation.
751     if (uvm_parent_gpu_is_virt_mode_sriov_heavy(gpu->parent))
752         channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
753     else
754         channel_type = UVM_CHANNEL_TYPE_MEMOPS;
755 
756     status = uvm_push_begin_acquire(gpu->channel_manager,
757                                     channel_type,
758                                     release_after_tracker,
759                                     &push,
760                                     "Pushing semaphore release (*0x%llx = %u)",
761                                     semaphore_user_addr,
762                                     semaphore_payload);
763     if (status != NV_OK)
764         return status;
765 
766     gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload);
767     uvm_push_end(&push);
768 
769     uvm_mutex_lock(&semaphore_va_range->tracker_lock);
770     status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push);
771     uvm_tracker_remove_completed(&semaphore_va_range->tracker);
772     uvm_mutex_unlock(&semaphore_va_range->tracker_lock);
773 
774     return status;
775 }
776 
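// Write the semaphore payload directly through the CPU kernel mapping. Only
// called once all the work the release depends on has completed (see
// semaphore_release).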
777 static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload)
778 {
779     char *semaphore_cpu_va;
780     NvU64 semaphore_offset;
781 
782     UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem));
783 
784     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr;
785 
786     // Prevent processor speculation prior to accessing user-mapped memory to
787     // avoid leaking information from side-channel attacks. Under speculation, a
788     // valid VA range which does not contain this semaphore could be used by the
789     // caller. It's unclear but likely that the user might be able to control
790     // the data at that address. Auditing all potential ways that could happen
791     // is difficult and error-prone, so to be on the safe side we'll just always
792     // block speculation.
793     nv_speculation_barrier();
794 
795     semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;
796 
797     UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
798 }
799 
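// Release the user semaphore from the most suitable processor: the pool's GPU
// owner if there is one, the CPU if the tracked work has already completed,
// and otherwise a GPU that has the semaphore memory mapped (preferring
// dest_gpu).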
800 static NV_STATUS semaphore_release(NvU64 semaphore_address,
801                                    NvU32 semaphore_payload,
802                                    uvm_va_range_semaphore_pool_t *semaphore_pool,
803                                    uvm_gpu_t *dest_gpu,
804                                    uvm_tracker_t *tracker_ptr)
805 {
806     uvm_gpu_t *gpu;
807     uvm_gpu_t *gpu_owner = semaphore_pool->owner;
808 
809     // If there is a GPU owner, release the semaphore from it.
810     if (gpu_owner != NULL)
811         return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
812 
813     // Attempt eager release from CPU if the tracker is already completed.
814     if (uvm_tracker_is_completed(tracker_ptr)) {
815         semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload);
816         return NV_OK;
817     }
818 
819     if (dest_gpu == NULL) {
820         // The destination is the CPU, but we didn't do a CPU release above
821         // because the previous work is not complete. This situation arises when
822         // accessed_by mappings are being set up asynchronously, or the
823         // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should
824         // be a registered GPU, since all CPU work is synchronous, and the
825         // semaphore must be mapped on that GPU.
826         //
827         // Note that the GPU selected for the release may not be the same device
828         // that prevented the tracker from being complete.
829         gpu = uvm_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on);
830 
831         UVM_ASSERT(gpu != NULL);
832     }
833     else {
834         gpu = dest_gpu;
835     }
836 
837     return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
838 }
839 
840 NV_STATUS uvm_migrate_init(void)
841 {
842     NV_STATUS status = uvm_migrate_pageable_init();
843     if (status != NV_OK)
844         return status;
845 
846     g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0;
847 
848     BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1));
849 
850     if (g_uvm_perf_migrate_cpu_preunmap_enable) {
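        // The pre-unmap chunk covers 2^block_order VA blocks. For example
        // (illustrative, assuming 2MB VA blocks), the default order of 2
        // yields an 8MB chunk and the maximum order of 10 yields a 2GB chunk.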
851         if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) {
852             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order;
853         }
854         else {
855             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
856 
857             pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n",
858                     uvm_perf_migrate_cpu_preunmap_block_order,
859                     UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT);
860         }
861     }
862 
863     return NV_OK;
864 }
865 
866 void uvm_migrate_exit(void)
867 {
868     uvm_migrate_pageable_exit();
869 }
870 
871 NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
872 {
873     uvm_va_space_t *va_space = uvm_va_space_get(filp);
874     uvm_tracker_t tracker = UVM_TRACKER_INIT();
875     uvm_tracker_t *tracker_ptr = NULL;
876     uvm_gpu_t *dest_gpu = NULL;
877     uvm_va_range_t *sema_va_range = NULL;
878     struct mm_struct *mm;
879     NV_STATUS status = NV_OK;
880     bool flush_events = false;
881     const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC);
882     int cpu_numa_node = (int)params->cpuNumaNode;
883 
884     // We temporarily allow 0 length in the IOCTL parameters as a signal to
885     // only release the semaphore. This is because user-space is in charge of
886     // migrating pageable memory in some cases.
887     //
888     // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch
889     // to migrate_vma for all types of vmas.
890     if (params->length > 0 || synchronous || params->semaphoreAddress == 0) {
891         if (uvm_api_range_invalid(params->base, params->length))
892             return NV_ERR_INVALID_ADDRESS;
893     }
894 
895     if (params->flags & ~UVM_MIGRATE_FLAGS_ALL)
896         return NV_ERR_INVALID_ARGUMENT;
897 
898     if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
899         UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
901         return NV_ERR_INVALID_ARGUMENT;
902     }
903 
904     // mmap_lock will be needed if we have to create CPU mappings
905     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
906     uvm_va_space_down_read(va_space);
907 
908     if (synchronous) {
909         if (params->semaphoreAddress != 0) {
910             status = NV_ERR_INVALID_ARGUMENT;
911             goto done;
912         }
913     }
914     else {
915         if (params->semaphoreAddress == 0) {
916             if (params->semaphorePayload != 0) {
917                 status = NV_ERR_INVALID_ARGUMENT;
918                 goto done;
919             }
920         }
921         else {
922             sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
923             if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
924                     !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
925                 status = NV_ERR_INVALID_ADDRESS;
926                 goto done;
927             }
928         }
929     }
930 
931     if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
932         if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
933             dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
934         else
935             dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
936 
937         if (!dest_gpu) {
938             status = NV_ERR_INVALID_DEVICE;
939             goto done;
940         }
941 
942         if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
943             status = NV_ERR_OUT_OF_RANGE;
944             goto done;
945         }
946     }
947     else {
948         // If cpu_numa_node is not -1, we only check that it is a valid node in
949         // the system, it has memory, and it doesn't correspond to a GPU node.
950         //
        // For pageable memory, this is fine because alloc_pages_node will
        // clamp the allocation to cpuset_current_mems_allowed when
        // uvm_migrate_pageable is called from process context (uvm_migrate)
        // when dst_id is CPU. UVM bottom half calls uvm_migrate_pageable with
        // CPU dst_id only when the VMA memory policy is set to dst_node_id and
        // dst_node_id is not NUMA_NO_NODE.
957         if (cpu_numa_node != -1 &&
958             (!nv_numa_node_has_memory(cpu_numa_node) ||
959              !node_isset(cpu_numa_node, node_possible_map) ||
960              uvm_va_space_find_gpu_with_memory_node_id(va_space, cpu_numa_node))) {
961             status = NV_ERR_INVALID_ARGUMENT;
962             goto done;
963         }
964     }
965 
966     UVM_ASSERT(status == NV_OK);
967 
968     // If we're synchronous or if we need to release a semaphore, use a tracker.
969     if (synchronous || params->semaphoreAddress)
970         tracker_ptr = &tracker;
971 
972     if (params->length > 0) {
973         uvm_api_range_type_t type;
974         uvm_processor_id_t dest_id = dest_gpu ? dest_gpu->id : UVM_ID_CPU;
975 
976         type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
977         if (type == UVM_API_RANGE_TYPE_INVALID) {
978             status = NV_ERR_INVALID_ADDRESS;
979             goto done;
980         }
981 
982         if (type == UVM_API_RANGE_TYPE_ATS) {
983             uvm_migrate_args_t uvm_migrate_args =
984             {
985                 .va_space                       = va_space,
986                 .mm                             = mm,
987                 .start                          = params->base,
988                 .length                         = params->length,
989                 .dst_id                         = dest_id,
990                 .dst_node_id                    = cpu_numa_node,
991                 .populate_permissions           = UVM_POPULATE_PERMISSIONS_INHERIT,
992                 .touch                          = false,
993                 .skip_mapped                    = false,
994                 .populate_on_cpu_alloc_failures = false,
995                 .user_space_start               = &params->userSpaceStart,
996                 .user_space_length              = &params->userSpaceLength,
997             };
998 
999             status = uvm_migrate_pageable(&uvm_migrate_args);
1000         }
1001         else {
1002             status = uvm_migrate(va_space,
1003                                  mm,
1004                                  params->base,
1005                                  params->length,
1006                                  dest_id,
1007                                  (UVM_ID_IS_CPU(dest_id) ? cpu_numa_node : NUMA_NO_NODE),
1008                                  params->flags,
1009                                  uvm_va_space_iter_first(va_space, params->base, params->base),
1010                                  tracker_ptr);
1011         }
1012     }
1013 
1014 done:
1015     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
1016     // we need to wait for the tracker to finish.
1017     //
1018     // TODO: Bug 1766650: For large migrations with destination CPU, try
1019     //       benchmarks to see if a two-pass approach would be faster (first
1020     //       pass pushes all GPU work asynchronously, second pass updates CPU
1021     //       mappings synchronously).
1022     if (mm)
1023         uvm_up_read_mmap_lock_out_of_order(mm);
1024 
1025     if (tracker_ptr) {
1026         // If requested, release semaphore
1027         if (params->semaphoreAddress && (status == NV_OK)) {
1028             status = semaphore_release(params->semaphoreAddress,
1029                                        params->semaphorePayload,
1030                                        &sema_va_range->semaphore_pool,
1031                                        dest_gpu,
1032                                        tracker_ptr);
1033         }
1034 
1035         // Wait on the tracker if we are synchronous or there was an error. The
1036         // VA space lock must be held to prevent GPUs from being unregistered.
1037         if (synchronous || (status != NV_OK)) {
1038             NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);
1039 
1040             // Only clobber status if we didn't hit an earlier error
1041             if (status == NV_OK)
1042                 status = tracker_status;
1043 
1044             flush_events = true;
1045         }
1046 
1047         uvm_tracker_deinit(tracker_ptr);
1048     }
1049 
1050     uvm_va_space_up_read(va_space);
1051     uvm_va_space_mm_or_current_release(va_space, mm);
1052 
1053     // If the migration is known to be complete, eagerly dispatch the migration
1054     // events, instead of processing them on a later event flush. Note that an
1055     // asynchronous migration could be complete by now, but the flush would not
1056     // be triggered.
1057     if (flush_events)
1058         uvm_tools_flush_events();
1059 
1060     return status;
1061 }
1062 
1063 NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
1064 {
1065     NV_STATUS status = NV_OK;
1066     NV_STATUS tracker_status = NV_OK;
1067     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1068     struct mm_struct *mm;
1069     uvm_range_group_t *range_group;
1070     uvm_range_group_range_t *rgr;
1071     uvm_processor_id_t dest_id;
1072     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1073     NvU32 migrate_flags = 0;
1074     uvm_gpu_t *gpu = NULL;
1075 
1076     // mmap_lock will be needed if we have to create CPU mappings
1077     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1078     uvm_va_space_down_read(va_space);
1079 
1080     if (uvm_uuid_is_cpu(&params->destinationUuid)) {
1081         dest_id = UVM_ID_CPU;
1082     }
1083     else {
1084         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
1085         if (!gpu) {
1086             status = NV_ERR_INVALID_DEVICE;
1087             goto done;
1088         }
1089 
1090         dest_id = gpu->id;
1091     }
1092 
1093     range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
1094     if (!range_group) {
1095         status = NV_ERR_OBJECT_NOT_FOUND;
1096         goto done;
1097     }
1098 
1099     // Migrate all VA ranges in the range group. uvm_migrate is used because it performs all
1100     // VA range validity checks.
1101     list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) {
1102         NvU64 start = rgr->node.start;
1103         NvU64 length = rgr->node.end - rgr->node.start + 1;
1104 
1105         if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
1106             status = NV_ERR_OUT_OF_RANGE;
1107         }
1108         else {
1109             uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);
1110 
1111             if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
1112                 status = NV_ERR_INVALID_ADDRESS;
1113                 goto done;
1114             }
1115 
1116             status = uvm_migrate(va_space,
1117                                  mm,
1118                                  start,
1119                                  length,
1120                                  dest_id,
1121                                  NUMA_NO_NODE,
1122                                  migrate_flags,
1123                                  first_va_range,
1124                                  &local_tracker);
1125         }
1126 
1127         if (status != NV_OK)
1128             goto done;
1129     }
1130 
1131 done:
1132     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
1133     // we need to wait for the tracker to finish.
1134     //
1135     // TODO: Bug 1766650: For large migrations with destination CPU, try
1136     //       benchmarks to see if a two-pass approach would be faster (first
1137     //       pass pushes all GPU work asynchronously, second pass updates CPU
1138     //       mappings synchronously).
1139     if (mm)
1140         uvm_up_read_mmap_lock_out_of_order(mm);
1141 
1142     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1143     uvm_va_space_up_read(va_space);
1144     uvm_va_space_mm_or_current_release(va_space, mm);
1145 
1146     // This API is synchronous, so wait for migrations to finish
1147     uvm_tools_flush_events();
1148 
1149     return status == NV_OK? tracker_status : status;
1150 }
1151