1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_common.h"
25 #include "uvm_ioctl.h"
26 #include "uvm_linux.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu.h"
29 #include "uvm_lock.h"
30 #include "uvm_va_space.h"
31 #include "uvm_va_range.h"
32 #include "uvm_va_block.h"
33 #include "uvm_tracker.h"
34 #include "uvm_api.h"
35 #include "uvm_channel.h"
36 #include "uvm_processors.h"
37 #include "uvm_push.h"
38 #include "uvm_hal.h"
39 #include "uvm_tools.h"
40 #include "uvm_migrate.h"
41 #include "uvm_migrate_pageable.h"
42 #include "uvm_va_space_mm.h"
43 #include "nv_speculation_barrier.h"
44 
45 typedef enum
46 {
47     UVM_MIGRATE_PASS_FIRST,
48     UVM_MIGRATE_PASS_SECOND
49 } uvm_migrate_pass_t;
50 
51 static int uvm_perf_migrate_cpu_preunmap_enable = 1;
52 module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO);
53 
54 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2
55 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX     10
56 static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
57 module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);
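//
// Both parameters are exposed read-only through sysfs (S_IRUGO), so they are
// intended to be set at module load time. An illustrative invocation
// (parameter names as defined above; the values are only an example):
//
//   insmod nvidia-uvm.ko uvm_perf_migrate_cpu_preunmap_enable=0 \
//          uvm_perf_migrate_cpu_preunmap_block_order=4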
58 
59 // Global post-processed values of the module parameters
60 static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
61 static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;
62 
63 static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
64 {
65     NvU64 end = base + length - 1;
66 
67     if (end > first_va_range->node.end)
68         return false;
69 
70     return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
71 }
72 
73 static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
74                                                 uvm_va_block_retry_t *va_block_retry,
75                                                 uvm_va_block_context_t *va_block_context,
76                                                 uvm_va_block_region_t region,
77                                                 uvm_processor_id_t dest_id)
78 {
79     uvm_prot_t prot;
80     uvm_page_index_t page_index;
81     NV_STATUS status = NV_OK;
82     const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id);
83 
84     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot)
85         va_block_context->mask_by_prot[prot - 1].count = 0;
86 
87     // Only map those pages that are not already mapped on destination
88     for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
89         prot = uvm_va_block_page_compute_highest_permission(va_block, va_block_context, dest_id, page_index);
90         if (prot == UVM_PROT_NONE)
91             continue;
92 
93         if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
94             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);
95 
96         uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
97     }
98 
99     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
100         if (va_block_context->mask_by_prot[prot - 1].count == 0)
101             continue;
102 
103         // We pass UvmEventMapRemoteCauseInvalid since the destination processor
104         // of a migration will never be mapped remotely
105         status = uvm_va_block_map(va_block,
106                                   va_block_context,
107                                   dest_id,
108                                   region,
109                                   &va_block_context->mask_by_prot[prot - 1].page_mask,
110                                   prot,
111                                   UvmEventMapRemoteCauseInvalid,
112                                   &va_block->tracker);
113         if (status != NV_OK)
114             break;
115 
116         // Whoever added the other mapping(s) should have already added
117         // SetAccessedBy processors
118     }
119 
120     return status;
121 }
122 
123 static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
124                                                   uvm_va_block_retry_t *va_block_retry,
125                                                   uvm_va_block_context_t *va_block_context,
126                                                   uvm_va_block_region_t region,
                                                  uvm_processor_id_t dest_id)
{
130     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
131     NV_STATUS status = NV_OK;
132     NV_STATUS tracker_status;
133 
134     // Get the mask of unmapped pages because it will change after the
135     // first map operation
136     uvm_va_block_unmapped_pages_get(va_block, region, &va_block_context->caller_page_mask);
137 
138     if (uvm_va_block_is_hmm(va_block) && !UVM_ID_IS_CPU(dest_id)) {
139         // Do not map pages that are already resident on the CPU. This is in
140         // order to avoid breaking system-wide atomic operations on HMM. HMM's
        // implementation of system-wide atomic operations involves restricting
        // mappings to one processor (CPU or a GPU) at a time. If we were to
        // grant a GPU a mapping to system memory, this gets into trouble
        // because, on the CPU side, Linux can silently upgrade PTE permissions
        // (from read-only to read-write, without any MMU notifiers firing),
        // thus breaking the model by allowing simultaneous read-write access
        // from two separate processors. To avoid that, just don't map such
        // pages at all when migrating.
149         uvm_page_mask_andnot(&va_block_context->caller_page_mask,
150                              &va_block_context->caller_page_mask,
151                              uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE));
152     }
153 
154     // Only map those pages that are not mapped anywhere else (likely due
155     // to a first touch or a migration). We pass
156     // UvmEventMapRemoteCauseInvalid since the destination processor of a
157     // migration will never be mapped remotely.
158     status = uvm_va_block_map(va_block,
159                               va_block_context,
160                               dest_id,
161                               region,
162                               &va_block_context->caller_page_mask,
163                               UVM_PROT_READ_WRITE_ATOMIC,
164                               UvmEventMapRemoteCauseInvalid,
165                               &local_tracker);
166     if (status != NV_OK)
167         goto out;
168 
169     // Add mappings for AccessedBy processors
170     //
171     // No mappings within this call will operate on dest_id, so we don't
172     // need to acquire the map operation above.
173     status = uvm_va_block_add_mappings_after_migration(va_block,
174                                                        va_block_context,
175                                                        dest_id,
176                                                        dest_id,
177                                                        region,
178                                                        &va_block_context->caller_page_mask,
179                                                        UVM_PROT_READ_WRITE_ATOMIC,
180                                                        NULL);
181 
182 out:
183     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
184     uvm_tracker_deinit(&local_tracker);
185     return status == NV_OK ? tracker_status : status;
186 }
187 
188 // Pages that are not mapped anywhere can be safely mapped with RWA permission.
// For the remaining pages, we need to compute individually the maximum
// permission that does not require a revocation.
191 static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block,
192                                             uvm_va_block_retry_t *va_block_retry,
193                                             uvm_va_block_context_t *va_block_context,
194                                             uvm_va_block_region_t region,
                                            uvm_processor_id_t dest_id)
{
198     NV_STATUS status;
199 
200     status = block_migrate_map_unmapped_pages(va_block,
201                                               va_block_retry,
202                                               va_block_context,
203                                               region,
204                                               dest_id);
205     if (status != NV_OK)
206         return status;
207 
208     return block_migrate_map_mapped_pages(va_block,
209                                           va_block_retry,
210                                           va_block_context,
211                                           region,
212                                           dest_id);
213 }
214 
215 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
216                                       uvm_va_block_retry_t *va_block_retry,
217                                       uvm_service_block_context_t *service_context,
218                                       uvm_va_block_region_t region,
219                                       uvm_processor_id_t dest_id,
220                                       uvm_migrate_mode_t mode,
221                                       uvm_tracker_t *out_tracker)
222 {
223     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
224     uvm_va_block_context_t *va_block_context = service_context->block_context;
225     NV_STATUS status, tracker_status = NV_OK;
226 
227     uvm_assert_mutex_locked(&va_block->lock);
228     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context->hmm.vma, region));
229 
230     if (uvm_va_block_is_hmm(va_block)) {
231         status = uvm_hmm_va_block_migrate_locked(va_block,
232                                                  va_block_retry,
233                                                  service_context,
234                                                  dest_id,
235                                                  region,
236                                                  UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
237     }
238     else {
239         uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
240 
241         if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
242             status = uvm_va_block_make_resident_read_duplicate(va_block,
243                                                                va_block_retry,
244                                                                va_block_context,
245                                                                dest_id,
246                                                                region,
247                                                                NULL,
248                                                                NULL,
249                                                                UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
250         }
251         else {
252             status = uvm_va_block_make_resident(va_block,
253                                                 va_block_retry,
254                                                 va_block_context,
255                                                 dest_id,
256                                                 region,
257                                                 NULL,
258                                                 NULL,
259                                                 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
260         }
261     }
262 
263     if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
264         // block_migrate_add_mappings will acquire the work from the above
265         // make_resident call and update the VA block tracker.
266         status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id);
267     }
268 
269     if (out_tracker)
270         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
271 
272     return status == NV_OK ? tracker_status : status;
273 }
274 
275 // Unmapping CPU pages on P9 systems is very costly, to the point that it
276 // becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for
277 // migrations that need to remove CPU mappings compared to migrations that only
278 // create CPU mappings. The overhead can be fully attributed to the TLB
279 // shootdown. When a CPU page is unmapped, it needs to (1) invalidate any copy
280 // in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK
281 // to remove the corresponding entries in the GPUs' TLBs. ATSDs are not even
// required when migrating managed memory since UVM ensures that there are no
283 // ATS entries cached in the GPU TLBs for the managed VA ranges. However, we
284 // don't have a way to skip them as of today.
285 //
286 // In order to minimize the overhead of CPU unmaps during UvmMigrate we try to
287 // call unmap_mapping_range on VA regions larger than the VA block granularity
288 // before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued. This
290 // is because the NPU code uses MMU notifiers in order to get a callback
291 // (invalidate_range) when a TLB invalidation is required. Fortunately, this
292 // callback is not called if there is nothing to be invalidated. Therefore, if
293 // we issue a large unmap, subsequent unmaps within that region will not invoke
294 // the callback.
295 //
296 // However, due to (1), even issuing a single invalidate for the whole migrated
297 // range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
298 // This is only expected to get worse if CPU-GPU interconnects' BW keeps
299 // increasing.
300 //
301 // Thus, VA range migrations are split into groups of contiguous VA blocks, and
302 // trigger a single pre-unmap of the group of VA blocks in the Linux kernel
303 // before the VA blocks' migration starts. This way, we trigger larger (more
// efficient) TLB invalidations than when we do it one VA block at a time,
// while still being able to pipeline the migration, which allows us to hide
// most of the cost of (1).
307 //
308 // However, there are some cases in which the CPU has mappings to the pages
309 // being migrated but they don't need to be removed (which can introduce
310 // unnecessary CPU faults later on). Therefore, we skip the pre-unmap step
311 // under the following conditions:
312 // - Pages mapped by the CPU that are *already* in the destination.
// - Pages mapped by the CPU that are *not* in the destination, but for which
// read-duplication is enabled in the VA range.
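//
// As a rough illustration (assuming a 2MB VA block size and the default
// preunmap block order of 2, i.e. an 8MB group size), a 64MB UvmMigrate issues
// on the order of 8 unmap_mapping_range calls, one per group, instead of up to
// 32 per-VA-block invalidations, and the copies for one group can overlap with
// the pre-unmap of the next group.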
315 
316 // This function checks if the pre-unmap optimization is required given the
317 // system capabilities and the destination of the migration. This is to skip
318 // any subsequent checks required by the optimization, which can be costly.
319 //
320 // The current logic checks that:
321 // - We are in the first pass of the migration (see the explanation of the
322 // two-pass strategy in uvm_migrate).
323 // - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't
324 // need this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks. Otherwise, we skip the preunmap to
// avoid the overhead.
327 static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
328                                              uvm_migrate_pass_t pass,
                                             bool is_single_block)
{
332     if (!g_uvm_perf_migrate_cpu_preunmap_enable)
333         return false;
334 
335     if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
336         return false;
337 
338     if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
339         return false;
340 
341     return true;
342 }
343 
344 // This function determines if the VA range properties avoid the need to remove
345 // CPU mappings on UvmMigrate. Currently, it only checks whether
346 // read-duplication is enabled in the VA range. This is because, when migrating
347 // read-duplicated VA blocks, the source processor doesn't need to be unmapped
348 // (though it may need write access revoked).
349 static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
350                                             uvm_va_space_t *va_space)
351 {
352     return !uvm_va_policy_is_read_duplicate(policy, va_space);
353 }
354 
// Function that determines if the VA block to be migrated contains pages with
// CPU mappings that don't need to be removed (see the comment above). In that
// case false is returned. Otherwise it returns true and stores, in the
// variable pointed to by num_unmap_pages, the number of pages that do need
// their CPU mappings removed.
360 static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
361                                             uvm_va_block_context_t *va_block_context,
362                                             NvU64 start,
363                                             NvU64 end,
364                                             uvm_processor_id_t dest_id,
365                                             NvU32 *num_unmap_pages)
366 {
367     const uvm_page_mask_t *mapped_pages_cpu;
368     NvU32 num_cpu_unchanged_pages = 0;
369     uvm_va_block_region_t region;
370 
371     *num_unmap_pages = 0;
372 
373     if (!va_block)
374         return true;
375 
376     region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));
377 
378     uvm_mutex_lock(&va_block->lock);
379 
380     mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
381     if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
382         const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
383         uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
384 
385         // TODO: Bug 1877578
386         //
387         // We assume that if pages are mapped on the CPU and not resident on
388         // the destination, the pages will change residency so the CPU must be
389         // unmapped. If we implement automatic read-duplication heuristics in
390         // the future, we'll also need to check if the pages are being
391         // read-duplicated.
392         uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest);
393 
394         num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region);
395     }
396 
397     *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages;
398 
399     uvm_mutex_unlock(&va_block->lock);
400 
401     return num_cpu_unchanged_pages == 0;
402 }
403 
404 static void preunmap_multi_block(uvm_va_range_t *va_range,
405                                  uvm_va_block_context_t *va_block_context,
406                                  NvU64 start,
407                                  NvU64 end,
408                                  uvm_processor_id_t dest_id)
409 {
410     size_t i;
411     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
412     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
413     NvU32 num_unmap_pages = 0;
414 
415     UVM_ASSERT(start >= va_range->node.start);
416     UVM_ASSERT(end  <= va_range->node.end);
417     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
418     uvm_assert_rwsem_locked(&va_range->va_space->lock);
419 
420     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
421 
422     for (i = first_block_index; i <= last_block_index; i++) {
423         NvU32 num_block_unmap_pages;
424 
425         if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
426                                              va_block_context,
427                                              start,
428                                              end,
429                                              dest_id,
430                                              &num_block_unmap_pages)) {
431             return;
432         }
433 
434         num_unmap_pages += num_block_unmap_pages;
435     }
436 
437     if (num_unmap_pages > 0)
438         unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
439 }
440 
441 static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
442                                                   uvm_service_block_context_t *service_context,
443                                                   NvU64 start,
444                                                   NvU64 end,
445                                                   uvm_processor_id_t dest_id,
446                                                   uvm_migrate_mode_t mode,
447                                                   uvm_tracker_t *out_tracker)
448 {
449     size_t i;
450     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
451     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
452 
453     UVM_ASSERT(start >= va_range->node.start);
454     UVM_ASSERT(end  <= va_range->node.end);
455     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
456     uvm_assert_rwsem_locked(&va_range->va_space->lock);
457 
458     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
459 
460     // Iterate over blocks, populating them if necessary
461     for (i = first_block_index; i <= last_block_index; i++) {
462         uvm_va_block_retry_t va_block_retry;
463         uvm_va_block_region_t region;
464         uvm_va_block_t *va_block;
465         NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);
466 
467         if (status != NV_OK)
468             return status;
469 
470         region = uvm_va_block_region_from_start_end(va_block,
471                                                     max(start, va_block->start),
472                                                     min(end, va_block->end));
473 
474         status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
475                                          &va_block_retry,
476                                          uvm_va_block_migrate_locked(va_block,
477                                                                      &va_block_retry,
478                                                                      service_context,
479                                                                      region,
480                                                                      dest_id,
481                                                                      mode,
482                                                                      out_tracker));
483         if (status != NV_OK)
484             return status;
485     }
486 
487     return NV_OK;
488 }
489 
490 static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
491                                       uvm_service_block_context_t *service_context,
492                                       NvU64 start,
493                                       NvU64 end,
494                                       uvm_processor_id_t dest_id,
495                                       uvm_migrate_mode_t mode,
496                                       bool should_do_cpu_preunmap,
497                                       uvm_tracker_t *out_tracker)
498 {
499     NvU64 preunmap_range_start = start;
500     uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
501 
502     should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(policy, va_range->va_space);
503 
504     // Divide migrations into groups of contiguous VA blocks. This is to trigger
505     // CPU unmaps for that region before the migration starts.
506     while (preunmap_range_start < end) {
507         NV_STATUS status;
508         NvU64 preunmap_range_end;
509 
510         if (should_do_cpu_preunmap) {
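            // Grouping arithmetic example (values purely illustrative,
            // assuming an 8MB group size): for a preunmap_range_start of
            // 18MB, UVM_ALIGN_UP(18MB + 1, 8MB) is 24MB, so this group covers
            // [18MB, 24MB - 1]; later iterations cover aligned 8MB groups
            // until 'end' is reached.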
511             preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
512             preunmap_range_end = min(preunmap_range_end - 1, end);
513 
514             preunmap_multi_block(va_range,
515                                  service_context->block_context,
516                                  preunmap_range_start,
517                                  preunmap_range_end,
518                                  dest_id);
519         }
520         else {
521             preunmap_range_end = end;
522         }
523 
524         status = uvm_va_range_migrate_multi_block(va_range,
525                                                   service_context,
526                                                   preunmap_range_start,
527                                                   preunmap_range_end,
528                                                   dest_id,
529                                                   mode,
530                                                   out_tracker);
531         if (status != NV_OK)
532             return status;
533 
534         preunmap_range_start = preunmap_range_end + 1;
535     }
536 
537     return NV_OK;
538 }
539 
540 static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
541                                     uvm_service_block_context_t *service_context,
542                                     uvm_va_range_t *first_va_range,
543                                     NvU64 base,
544                                     NvU64 length,
545                                     uvm_processor_id_t dest_id,
546                                     uvm_migrate_mode_t mode,
547                                     bool should_do_cpu_preunmap,
548                                     uvm_tracker_t *out_tracker)
549 {
550     uvm_va_range_t *va_range, *va_range_last;
551     NvU64 end = base + length - 1;
552     NV_STATUS status = NV_OK;
553     bool skipped_migrate = false;
554 
555     if (!first_va_range) {
556         // For HMM, we iterate over va_blocks since there is no va_range.
557         return uvm_hmm_migrate_ranges(va_space, service_context, base, length, dest_id, mode, out_tracker);
558     }
559 
560     UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));
561 
562     va_range_last = NULL;
563     uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
564         uvm_range_group_range_iter_t iter;
565         uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
566 
567         va_range_last = va_range;
568 
569         // Only managed ranges can be migrated
570         if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
571             status = NV_ERR_INVALID_ADDRESS;
572             break;
573         }
574 
575         // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
        // into many range groups. For this reason, we iterate over each
        // va_range first, then through the range groups within.
578         uvm_range_group_for_each_migratability_in(&iter,
579                                                   va_space,
580                                                   max(base, va_range->node.start),
581                                                   min(end, va_range->node.end)) {
582             // Skip non-migratable VA ranges
583             if (!iter.migratable) {
584                 // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't
585                 // already resident at dest_id.
586                 if (!uvm_va_policy_preferred_location_equal(policy,
587                                                             dest_id,
588                                                             service_context->block_context->make_resident.dest_nid))
589                     skipped_migrate = true;
590             }
591             else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
592                      !uvm_id_equal(dest_id, policy->preferred_location)) {
593                 // Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
594                 // unless it's the preferred location
595                 status = NV_ERR_INVALID_DEVICE;
596                 break;
597             }
598             else {
599                 status = uvm_va_range_migrate(va_range,
600                                               service_context,
601                                               iter.start,
602                                               iter.end,
603                                               dest_id,
604                                               mode,
605                                               should_do_cpu_preunmap,
606                                               out_tracker);
607                 if (status != NV_OK)
608                     break;
609             }
610         }
611     }
612 
613     if (status != NV_OK)
614         return status;
615 
616     // Check that we were able to iterate over the entire range without any gaps
617     if (!va_range_last || va_range_last->node.end < end)
618         return NV_ERR_INVALID_ADDRESS;
619 
620     if (skipped_migrate)
621         return NV_WARN_MORE_PROCESSING_REQUIRED;
622 
623     return NV_OK;
624 }
625 
626 static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
627                              struct mm_struct *mm,
628                              NvU64 base,
629                              NvU64 length,
630                              uvm_processor_id_t dest_id,
631                              int dest_nid,
632                              NvU32 migrate_flags,
633                              uvm_va_range_t *first_va_range,
634                              uvm_tracker_t *out_tracker)
635 {
636     NV_STATUS status = NV_OK;
637     uvm_service_block_context_t *service_context;
638     bool do_mappings;
639     bool do_two_passes;
640     bool is_single_block;
641     bool should_do_cpu_preunmap;
642 
643     uvm_assert_rwsem_locked(&va_space->lock);
644 
645     // If the GPU has its memory disabled, just skip the migration and let
646     // faults take care of things.
647     if (!uvm_va_space_processor_has_memory(va_space, dest_id))
648         return NV_OK;
649 
650     if (mm)
651         uvm_assert_mmap_lock_locked(mm);
652     else if (!first_va_range)
653         return NV_ERR_INVALID_ADDRESS;
654 
655     service_context = uvm_service_block_context_alloc(mm);
656     if (!service_context)
657         return NV_ERR_NO_MEMORY;
658 
659     service_context->block_context->make_resident.dest_nid = dest_nid;
660 
661     // We perform two passes (unless the migration only covers a single VA
662     // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the
663     // following scenarios:
664     //
665     // - Migrations that add CPU mappings, since they are synchronous operations
666     // that delay the migration of the next VA blocks.
667     // - Concurrent migrations. This is due to our current channel selection
668     // logic that doesn't prevent false dependencies between independent
    // operations. For example, removal of mappings for outgoing transfers is
670     // delayed by the mappings added by incoming transfers.
671     // TODO: Bug 1764953: Re-evaluate the two-pass logic when channel selection
672     // is overhauled.
673     //
674     // The two passes are as follows:
675     //
676     // 1- Transfer all VA blocks (do not add mappings)
677     // 2- Go block by block reexecuting the transfer (in case someone moved it
678     // since the first pass), and adding the mappings.
679     //
680     // For HMM (!first_va_range), we always do a single pass.
681     is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
682     do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
683     do_two_passes = do_mappings && !is_single_block;
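
    // For example (assuming 2MB VA blocks), a 512MB migration to a GPU spans
    // 256 blocks, so both passes run. A migration fully contained in a single
    // block, or a CPU-destination migration with UVM_MIGRATE_FLAG_SKIP_CPU_MAP,
    // takes a single pass.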
684 
685     if (do_two_passes) {
686         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block);
687 
688         status = uvm_migrate_ranges(va_space,
689                                     service_context,
690                                     first_va_range,
691                                     base,
692                                     length,
693                                     dest_id,
694                                     UVM_MIGRATE_MODE_MAKE_RESIDENT,
695                                     should_do_cpu_preunmap,
696                                     out_tracker);
697     }
698 
699     if (status == NV_OK) {
700         uvm_migrate_mode_t mode = do_mappings? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP:
701                                                UVM_MIGRATE_MODE_MAKE_RESIDENT;
702         uvm_migrate_pass_t pass = do_two_passes? UVM_MIGRATE_PASS_SECOND:
703                                                  UVM_MIGRATE_PASS_FIRST;
704         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block);
705 
706         status = uvm_migrate_ranges(va_space,
707                                     service_context,
708                                     first_va_range,
709                                     base,
710                                     length,
711                                     dest_id,
712                                     mode,
713                                     should_do_cpu_preunmap,
714                                     out_tracker);
715     }
716 
717     uvm_service_block_context_free(service_context);
718 
719     return status;
720 }
721 
722 static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu,
723                                             uvm_va_range_semaphore_pool_t *semaphore_va_range,
724                                             NvU64 semaphore_user_addr,
725                                             NvU32 semaphore_payload,
726                                             uvm_tracker_t *release_after_tracker)
727 {
728     NV_STATUS status;
729     uvm_push_t push;
730     uvm_channel_type_t channel_type;
731     NvU64 semaphore_gpu_va;
732     NvU64 semaphore_offset;
733 
734     UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu));
735 
736     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr;
737     semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset;
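
    // For example (addresses purely illustrative): if the user mapping of the
    // semaphore pool starts at 0x7f0000000000 and semaphore_user_addr is
    // 0x7f0000000040, semaphore_offset is 0x40 and the release writes to the
    // pool's kernel GPU VA plus 0x40.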
738 
739     // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a
740     // performance standpoint because if the migration is targeting a GPU, it is
741     // likely that the channel used for the GPU page table update (pushed to
    // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. Using a
    // single channel avoids an inter-channel dependency that could otherwise
    // add significant overhead to the enclosing migration.
745     //
746     // In SR-IOV heavy, the user semaphore release is functionally forbidden
747     // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is not
748     // a page tree operation.
749     if (uvm_parent_gpu_is_virt_mode_sriov_heavy(gpu->parent))
750         channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
751     else
752         channel_type = UVM_CHANNEL_TYPE_MEMOPS;
753 
754     status = uvm_push_begin_acquire(gpu->channel_manager,
755                                     channel_type,
756                                     release_after_tracker,
757                                     &push,
758                                     "Pushing semaphore release (*0x%llx = %u)",
759                                     semaphore_user_addr,
760                                     semaphore_payload);
761     if (status != NV_OK)
762         return status;
763 
764     gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload);
765     uvm_push_end(&push);
766 
767     uvm_mutex_lock(&semaphore_va_range->tracker_lock);
768     status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push);
769     uvm_tracker_remove_completed(&semaphore_va_range->tracker);
770     uvm_mutex_unlock(&semaphore_va_range->tracker_lock);
771 
772     return status;
773 }
774 
775 static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload)
776 {
777     char *semaphore_cpu_va;
778     NvU64 semaphore_offset;
779 
780     UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem));
781 
782     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr;
783 
784     // Prevent processor speculation prior to accessing user-mapped memory to
785     // avoid leaking information from side-channel attacks. Under speculation, a
786     // valid VA range which does not contain this semaphore could be used by the
787     // caller. It's unclear but likely that the user might be able to control
788     // the data at that address. Auditing all potential ways that could happen
789     // is difficult and error-prone, so to be on the safe side we'll just always
790     // block speculation.
791     nv_speculation_barrier();
792 
793     semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;
794 
795     UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
796 }
797 
798 static NV_STATUS semaphore_release(NvU64 semaphore_address,
799                                    NvU32 semaphore_payload,
800                                    uvm_va_range_semaphore_pool_t *semaphore_pool,
801                                    uvm_gpu_t *dest_gpu,
802                                    uvm_tracker_t *tracker_ptr)
803 {
804     uvm_gpu_t *gpu;
805     uvm_gpu_t *gpu_owner = semaphore_pool->owner;
806 
807     // If there is a GPU owner, release the semaphore from it.
808     if (gpu_owner != NULL)
809         return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
810 
811     // Attempt eager release from CPU if the tracker is already completed.
812     if (uvm_tracker_is_completed(tracker_ptr)) {
813         semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload);
814         return NV_OK;
815     }
816 
817     if (dest_gpu == NULL) {
818         // The destination is the CPU, but we didn't do a CPU release above
819         // because the previous work is not complete. This situation arises when
820         // accessed_by mappings are being set up asynchronously, or the
821         // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should
822         // be a registered GPU, since all CPU work is synchronous, and the
823         // semaphore must be mapped on that GPU.
824         //
825         // Note that the GPU selected for the release may not be the same device
826         // that prevented the tracker from being complete.
827         gpu = uvm_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on);
828 
829         UVM_ASSERT(gpu != NULL);
830     }
831     else {
832         gpu = dest_gpu;
833     }
834 
835     return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
836 }
837 
838 NV_STATUS uvm_migrate_init(void)
839 {
840     NV_STATUS status = uvm_migrate_pageable_init();
841     if (status != NV_OK)
842         return status;
843 
844     g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0;
845 
846     BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1));
847 
848     if (g_uvm_perf_migrate_cpu_preunmap_enable) {
849         if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) {
850             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order;
851         }
852         else {
853             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
854 
855             pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n",
856                     uvm_perf_migrate_cpu_preunmap_block_order,
857                     UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT);
858         }
859     }
860 
861     return NV_OK;
862 }
863 
864 void uvm_migrate_exit(void)
865 {
866     uvm_migrate_pageable_exit();
867 }
868 
869 NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
870 {
871     uvm_va_space_t *va_space = uvm_va_space_get(filp);
872     uvm_tracker_t tracker = UVM_TRACKER_INIT();
873     uvm_tracker_t *tracker_ptr = NULL;
874     uvm_gpu_t *dest_gpu = NULL;
875     uvm_va_range_t *sema_va_range = NULL;
876     struct mm_struct *mm;
877     NV_STATUS status = NV_OK;
878     bool flush_events = false;
879     const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC);
880     int cpu_numa_node = (int)params->cpuNumaNode;
881 
882     // We temporarily allow 0 length in the IOCTL parameters as a signal to
883     // only release the semaphore. This is because user-space is in charge of
884     // migrating pageable memory in some cases.
885     //
886     // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch
887     // to migrate_vma for all types of vmas.
888     if (params->length > 0 || synchronous || params->semaphoreAddress == 0) {
889         if (uvm_api_range_invalid(params->base, params->length))
890             return NV_ERR_INVALID_ADDRESS;
891     }
892 
893     if (params->flags & ~UVM_MIGRATE_FLAGS_ALL)
894         return NV_ERR_INVALID_ARGUMENT;
895 
896     if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
897         UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
899         return NV_ERR_INVALID_ARGUMENT;
900     }
901 
902     // mmap_lock will be needed if we have to create CPU mappings
903     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
904     uvm_va_space_down_read(va_space);
905 
906     if (synchronous) {
907         if (params->semaphoreAddress != 0) {
908             status = NV_ERR_INVALID_ARGUMENT;
909             goto done;
910         }
911     }
912     else {
913         if (params->semaphoreAddress == 0) {
914             if (params->semaphorePayload != 0) {
915                 status = NV_ERR_INVALID_ARGUMENT;
916                 goto done;
917             }
918         }
919         else {
920             sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
921             if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
922                     !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
923                 status = NV_ERR_INVALID_ADDRESS;
924                 goto done;
925             }
926         }
927     }
928 
929     if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
930         if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
931             dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
932         else
933             dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
934 
935         if (!dest_gpu) {
936             status = NV_ERR_INVALID_DEVICE;
937             goto done;
938         }
939 
940         if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
941             status = NV_ERR_OUT_OF_RANGE;
942             goto done;
943         }
944     }
945     else {
        // If cpu_numa_node is not -1, we only check that it is a valid node in
        // the system, that it has memory, and that it doesn't correspond to a
        // GPU node.
948         //
949         // For pageable memory, this is fine because alloc_pages_node will clamp
950         // the allocation to cpuset_current_mems_allowed when uvm_migrate
951         //_pageable is called from process context (uvm_migrate) when dst_id is
952         // CPU. UVM bottom half calls uvm_migrate_pageable with CPU dst_id only
953         // when the VMA memory policy is set to dst_node_id and dst_node_id is
954         // not NUMA_NO_NODE.
955         if (cpu_numa_node != -1 &&
956             (!nv_numa_node_has_memory(cpu_numa_node) ||
957              !node_isset(cpu_numa_node, node_possible_map) ||
958              uvm_va_space_find_gpu_with_memory_node_id(va_space, cpu_numa_node))) {
959             status = NV_ERR_INVALID_ARGUMENT;
960             goto done;
961         }
962     }
963 
964     UVM_ASSERT(status == NV_OK);
965 
966     // If we're synchronous or if we need to release a semaphore, use a tracker.
967     if (synchronous || params->semaphoreAddress)
968         tracker_ptr = &tracker;
969 
970     if (params->length > 0) {
971         uvm_api_range_type_t type;
972         uvm_processor_id_t dest_id = dest_gpu ? dest_gpu->id : UVM_ID_CPU;
973 
974         type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
975         if (type == UVM_API_RANGE_TYPE_INVALID) {
976             status = NV_ERR_INVALID_ADDRESS;
977             goto done;
978         }
979 
980         if (type == UVM_API_RANGE_TYPE_ATS) {
981             uvm_migrate_args_t uvm_migrate_args =
982             {
983                 .va_space                       = va_space,
984                 .mm                             = mm,
985                 .start                          = params->base,
986                 .length                         = params->length,
987                 .dst_id                         = dest_id,
988                 .dst_node_id                    = cpu_numa_node,
989                 .populate_permissions           = UVM_POPULATE_PERMISSIONS_INHERIT,
990                 .touch                          = false,
991                 .skip_mapped                    = false,
992                 .populate_on_cpu_alloc_failures = false,
993                 .user_space_start               = &params->userSpaceStart,
994                 .user_space_length              = &params->userSpaceLength,
995             };
996 
997             status = uvm_migrate_pageable(&uvm_migrate_args);
998         }
999         else {
1000             status = uvm_migrate(va_space,
1001                                  mm,
1002                                  params->base,
1003                                  params->length,
1004                                  dest_id,
1005                                  (UVM_ID_IS_CPU(dest_id) ? cpu_numa_node : NUMA_NO_NODE),
1006                                  params->flags,
1007                                  uvm_va_space_iter_first(va_space, params->base, params->base),
1008                                  tracker_ptr);
1009         }
1010     }
1011 
1012 done:
1013     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
1014     // we need to wait for the tracker to finish.
1015     //
1016     // TODO: Bug 1766650: For large migrations with destination CPU, try
1017     //       benchmarks to see if a two-pass approach would be faster (first
1018     //       pass pushes all GPU work asynchronously, second pass updates CPU
1019     //       mappings synchronously).
1020     if (mm)
1021         uvm_up_read_mmap_lock_out_of_order(mm);
1022 
1023     if (tracker_ptr) {
1024         // If requested, release semaphore
1025         if (params->semaphoreAddress && (status == NV_OK)) {
1026             status = semaphore_release(params->semaphoreAddress,
1027                                        params->semaphorePayload,
1028                                        &sema_va_range->semaphore_pool,
1029                                        dest_gpu,
1030                                        tracker_ptr);
1031         }
1032 
1033         // Wait on the tracker if we are synchronous or there was an error. The
1034         // VA space lock must be held to prevent GPUs from being unregistered.
1035         if (synchronous || (status != NV_OK)) {
1036             NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);
1037 
1038             // Only clobber status if we didn't hit an earlier error
1039             if (status == NV_OK)
1040                 status = tracker_status;
1041 
1042             flush_events = true;
1043         }
1044 
1045         uvm_tracker_deinit(tracker_ptr);
1046     }
1047 
1048     uvm_va_space_up_read(va_space);
1049     uvm_va_space_mm_or_current_release(va_space, mm);
1050 
1051     // If the migration is known to be complete, eagerly dispatch the migration
1052     // events, instead of processing them on a later event flush. Note that an
1053     // asynchronous migration could be complete by now, but the flush would not
1054     // be triggered.
1055     if (flush_events)
1056         uvm_tools_flush_events();
1057 
1058     return status;
1059 }
1060 
1061 NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
1062 {
1063     NV_STATUS status = NV_OK;
1064     NV_STATUS tracker_status = NV_OK;
1065     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1066     struct mm_struct *mm;
1067     uvm_range_group_t *range_group;
1068     uvm_range_group_range_t *rgr;
1069     uvm_processor_id_t dest_id;
1070     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1071     NvU32 migrate_flags = 0;
1072     uvm_gpu_t *gpu = NULL;
1073 
1074     // mmap_lock will be needed if we have to create CPU mappings
1075     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1076     uvm_va_space_down_read(va_space);
1077 
1078     if (uvm_uuid_is_cpu(&params->destinationUuid)) {
1079         dest_id = UVM_ID_CPU;
1080     }
1081     else {
1082         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
1083         if (!gpu) {
1084             status = NV_ERR_INVALID_DEVICE;
1085             goto done;
1086         }
1087 
1088         dest_id = gpu->id;
1089     }
1090 
1091     range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
1092     if (!range_group) {
1093         status = NV_ERR_OBJECT_NOT_FOUND;
1094         goto done;
1095     }
1096 
1097     // Migrate all VA ranges in the range group. uvm_migrate is used because it performs all
1098     // VA range validity checks.
1099     list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) {
1100         NvU64 start = rgr->node.start;
1101         NvU64 length = rgr->node.end - rgr->node.start + 1;
1102 
1103         if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
1104             status = NV_ERR_OUT_OF_RANGE;
1105         }
1106         else {
1107             uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);
1108 
1109             if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
1110                 status = NV_ERR_INVALID_ADDRESS;
1111                 goto done;
1112             }
1113 
1114             status = uvm_migrate(va_space,
1115                                  mm,
1116                                  start,
1117                                  length,
1118                                  dest_id,
1119                                  NUMA_NO_NODE,
1120                                  migrate_flags,
1121                                  first_va_range,
1122                                  &local_tracker);
1123         }
1124 
1125         if (status != NV_OK)
1126             goto done;
1127     }
1128 
1129 done:
1130     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
1131     // we need to wait for the tracker to finish.
1132     //
1133     // TODO: Bug 1766650: For large migrations with destination CPU, try
1134     //       benchmarks to see if a two-pass approach would be faster (first
1135     //       pass pushes all GPU work asynchronously, second pass updates CPU
1136     //       mappings synchronously).
1137     if (mm)
1138         uvm_up_read_mmap_lock_out_of_order(mm);
1139 
1140     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1141     uvm_va_space_up_read(va_space);
1142     uvm_va_space_mm_or_current_release(va_space, mm);
1143 
1144     // This API is synchronous, so wait for migrations to finish
1145     uvm_tools_flush_events();
1146 
1147     return status == NV_OK? tracker_status : status;
1148 }
1149