1 /*******************************************************************************
2     Copyright (c) 2016-2022 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_common.h"
25 #include "uvm_ioctl.h"
26 #include "uvm_linux.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu.h"
29 #include "uvm_lock.h"
30 #include "uvm_va_space.h"
31 #include "uvm_va_range.h"
32 #include "uvm_va_block.h"
33 #include "uvm_tracker.h"
34 #include "uvm_api.h"
35 #include "uvm_channel.h"
36 #include "uvm_push.h"
37 #include "uvm_hal.h"
38 #include "uvm_tools.h"
39 #include "uvm_migrate.h"
40 #include "uvm_migrate_pageable.h"
41 #include "uvm_va_space_mm.h"
42 #include "nv_speculation_barrier.h"
43 
44 typedef enum
45 {
46     UVM_MIGRATE_PASS_FIRST,
47     UVM_MIGRATE_PASS_SECOND
48 } uvm_migrate_pass_t;
49 
50 static int uvm_perf_migrate_cpu_preunmap_enable = 1;
51 module_param(uvm_perf_migrate_cpu_preunmap_enable, int, S_IRUGO);
52 
53 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT 2
54 #define UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX     10
55 static unsigned uvm_perf_migrate_cpu_preunmap_block_order = UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
56 module_param(uvm_perf_migrate_cpu_preunmap_block_order, uint, S_IRUGO);
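
// The CPU pre-unmap granularity derived from this parameter is
// UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order (computed in
// uvm_migrate_init()). For example, assuming the usual 2MB VA block size, the
// default order of 2 yields 8MB pre-unmap groups.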
57 
58 // Global post-processed values of the module parameters
59 static bool g_uvm_perf_migrate_cpu_preunmap_enable __read_mostly;
60 static NvU64 g_uvm_perf_migrate_cpu_preunmap_size __read_mostly;
61 
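// Returns true if the interval [base, base + length - 1] ends within
// first_va_range and both its start and end fall within the same VA block.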
62 static bool is_migration_single_block(uvm_va_range_t *first_va_range, NvU64 base, NvU64 length)
63 {
64     NvU64 end = base + length - 1;
65 
66     if (end > first_va_range->node.end)
67         return false;
68 
69     return uvm_va_range_block_index(first_va_range, base) == uvm_va_range_block_index(first_va_range, end);
70 }
71 
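// Map the pages in the region that are not yet mapped on dest_id. Each page is
// mapped with the highest permission that does not require revoking access
// from other processors, and pages are batched by protection level so that a
// single map call is issued per permission.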
72 static NV_STATUS block_migrate_map_mapped_pages(uvm_va_block_t *va_block,
73                                                 uvm_va_block_retry_t *va_block_retry,
74                                                 uvm_va_block_context_t *va_block_context,
75                                                 uvm_va_block_region_t region,
76                                                 uvm_processor_id_t dest_id)
77 {
78     uvm_prot_t prot;
79     uvm_page_index_t page_index;
80     NV_STATUS status = NV_OK;
81     const uvm_page_mask_t *pages_mapped_on_destination = uvm_va_block_map_mask_get(va_block, dest_id);
82 
83     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot)
84         va_block_context->mask_by_prot[prot - 1].count = 0;
85 
    // Only map those pages that are not already mapped on the destination
87     for_each_va_block_unset_page_in_region_mask(page_index, pages_mapped_on_destination, region) {
88         prot = uvm_va_block_page_compute_highest_permission(va_block, dest_id, page_index);
89         if (prot == UVM_PROT_NONE)
90             continue;
91 
92         if (va_block_context->mask_by_prot[prot - 1].count++ == 0)
93             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot - 1].page_mask);
94 
95         uvm_page_mask_set(&va_block_context->mask_by_prot[prot - 1].page_mask, page_index);
96     }
97 
98     for (prot = UVM_PROT_READ_ONLY; prot <= UVM_PROT_READ_WRITE_ATOMIC; ++prot) {
99         if (va_block_context->mask_by_prot[prot - 1].count == 0)
100             continue;
101 
102         // We pass UvmEventMapRemoteCauseInvalid since the destination processor
103         // of a migration will never be mapped remotely
104         status = uvm_va_block_map(va_block,
105                                   va_block_context,
106                                   dest_id,
107                                   region,
108                                   &va_block_context->mask_by_prot[prot - 1].page_mask,
109                                   prot,
110                                   UvmEventMapRemoteCauseInvalid,
111                                   &va_block->tracker);
112         if (status != NV_OK)
113             break;
114 
115         // Whoever added the other mapping(s) should have already added
116         // SetAccessedBy processors
117     }
118 
119     return status;
120 }
121 
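// Map the pages in the region that are not mapped on any processor with RWA
// permission on dest_id, then add mappings for the AccessedBy processors. The
// pushed work is collected in a local tracker that is merged back into the VA
// block tracker before returning.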
122 static NV_STATUS block_migrate_map_unmapped_pages(uvm_va_block_t *va_block,
123                                                   uvm_va_block_retry_t *va_block_retry,
124                                                   uvm_va_block_context_t *va_block_context,
125                                                   uvm_va_block_region_t region,
126                                                   uvm_processor_id_t dest_id)
127 
128 {
129     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
130     NV_STATUS status = NV_OK;
131     NV_STATUS tracker_status;
132 
133     // Save the mask of unmapped pages because it will change after the
134     // first map operation
135     uvm_page_mask_complement(&va_block_context->caller_page_mask, &va_block->maybe_mapped_pages);
136 
137     // Only map those pages that are not mapped anywhere else (likely due
138     // to a first touch or a migration). We pass
139     // UvmEventMapRemoteCauseInvalid since the destination processor of a
140     // migration will never be mapped remotely.
141     status = uvm_va_block_map(va_block,
142                               va_block_context,
143                               dest_id,
144                               region,
145                               &va_block_context->caller_page_mask,
146                               UVM_PROT_READ_WRITE_ATOMIC,
147                               UvmEventMapRemoteCauseInvalid,
148                               &local_tracker);
149     if (status != NV_OK)
150         goto out;
151 
152     // Add mappings for AccessedBy processors
153     //
154     // No mappings within this call will operate on dest_id, so we don't
155     // need to acquire the map operation above.
156     status = uvm_va_block_add_mappings_after_migration(va_block,
157                                                        va_block_context,
158                                                        dest_id,
159                                                        dest_id,
160                                                        region,
161                                                        &va_block_context->caller_page_mask,
162                                                        UVM_PROT_READ_WRITE_ATOMIC,
163                                                        NULL);
164 
165 out:
166     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
167     uvm_tracker_deinit(&local_tracker);
168     return status == NV_OK ? tracker_status : status;
169 }
170 
// Pages that are not mapped anywhere can be safely mapped with RWA permission.
// For the remaining pages, the maximum permission that does not require a
// revocation must be computed individually.
174 static NV_STATUS block_migrate_add_mappings(uvm_va_block_t *va_block,
175                                             uvm_va_block_retry_t *va_block_retry,
176                                             uvm_va_block_context_t *va_block_context,
177                                             uvm_va_block_region_t region,
178                                             uvm_processor_id_t dest_id)
179 
180 {
181     NV_STATUS status;
182 
183     status = block_migrate_map_unmapped_pages(va_block,
184                                               va_block_retry,
185                                               va_block_context,
186                                               region,
187                                               dest_id);
188     if (status != NV_OK)
189         return status;
190 
191     return block_migrate_map_mapped_pages(va_block,
192                                           va_block_retry,
193                                           va_block_context,
194                                           region,
195                                           dest_id);
196 }
197 
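// Migrate the given region of va_block to dest_id and, if requested by mode,
// map it on the destination. The caller must hold the va_block lock. Any
// pushed work is added to the VA block tracker and, if provided, to
// out_tracker.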
198 NV_STATUS uvm_va_block_migrate_locked(uvm_va_block_t *va_block,
199                                       uvm_va_block_retry_t *va_block_retry,
200                                       uvm_va_block_context_t *va_block_context,
201                                       uvm_va_block_region_t region,
202                                       uvm_processor_id_t dest_id,
203                                       uvm_migrate_mode_t mode,
204                                       uvm_tracker_t *out_tracker)
205 {
206     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
207     NV_STATUS status, tracker_status = NV_OK;
208 
209     uvm_assert_mutex_locked(&va_block->lock);
210     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block, va_block_context, region));
211 
212     if (uvm_va_block_is_hmm(va_block)) {
213         status = uvm_hmm_va_block_migrate_locked(va_block,
214                                                  va_block_retry,
215                                                  va_block_context,
216                                                  dest_id,
217                                                  region,
218                                                  UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
219     }
220     else {
221         va_block_context->policy = uvm_va_range_get_policy(va_block->va_range);
222 
223         if (uvm_va_policy_is_read_duplicate(va_block_context->policy, va_space)) {
224             status = uvm_va_block_make_resident_read_duplicate(va_block,
225                                                                va_block_retry,
226                                                                va_block_context,
227                                                                dest_id,
228                                                                region,
229                                                                NULL,
230                                                                NULL,
231                                                                UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
232         }
233         else {
234             status = uvm_va_block_make_resident(va_block,
235                                                 va_block_retry,
236                                                 va_block_context,
237                                                 dest_id,
238                                                 region,
239                                                 NULL,
240                                                 NULL,
241                                                 UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
242         }
243     }
244 
245     if (status == NV_OK && mode == UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP) {
246         // block_migrate_add_mappings will acquire the work from the above
247         // make_resident call and update the VA block tracker.
248         status = block_migrate_add_mappings(va_block, va_block_retry, va_block_context, region, dest_id);
249     }
250 
251     if (out_tracker)
252         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
253 
254     return status == NV_OK ? tracker_status : status;
255 }
256 
257 // Unmapping CPU pages on P9 systems is very costly, to the point that it
258 // becomes the bottleneck of UvmMigrate. We have measured up to 3x lower BW for
259 // migrations that need to remove CPU mappings compared to migrations that only
260 // create CPU mappings. The overhead can be fully attributed to the TLB
261 // shootdown. When a CPU page is unmapped, it needs to (1) invalidate any copy
262 // in the P9 cores, and (2) if ATS is enabled, issue ATSD messages over NVLINK
263 // to remove the corresponding entries in the GPUs' TLBs. ATSDs are not even
// required when migrating managed memory since UVM ensures that there are no
265 // ATS entries cached in the GPU TLBs for the managed VA ranges. However, we
266 // don't have a way to skip them as of today.
267 //
268 // In order to minimize the overhead of CPU unmaps during UvmMigrate we try to
269 // call unmap_mapping_range on VA regions larger than the VA block granularity
270 // before the actual migration so that TLB invalidations are batched better by
// the OS. This also has an impact on the number of ATSD messages issued. This
272 // is because the NPU code uses MMU notifiers in order to get a callback
273 // (invalidate_range) when a TLB invalidation is required. Fortunately, this
274 // callback is not called if there is nothing to be invalidated. Therefore, if
275 // we issue a large unmap, subsequent unmaps within that region will not invoke
276 // the callback.
277 //
278 // However, due to (1), even issuing a single invalidate for the whole migrated
279 // range introduces a noticeable overhead (20-30%) on systems with 3xNVLINK2.
280 // This is only expected to get worse if CPU-GPU interconnects' BW keeps
281 // increasing.
282 //
283 // Thus, VA range migrations are split into groups of contiguous VA blocks, and
284 // trigger a single pre-unmap of the group of VA blocks in the Linux kernel
285 // before the VA blocks' migration starts. This way, we trigger larger (more
// efficient) TLB invalidations than when we do it one VA block at a time,
// while still being able to pipeline the migration, which allows us to hide
// most of the cost of (1).
289 //
// However, there are cases in which the CPU has mappings to the pages being
// migrated that don't need to be removed (removing them would only introduce
// unnecessary CPU faults later on). Therefore, we skip the pre-unmap step
// under the following conditions:
// - The pages mapped by the CPU are already resident at the destination.
// - The pages mapped by the CPU are not resident at the destination, but
// read-duplication is enabled in the VA range.
297 
298 // This function checks if the pre-unmap optimization is required given the
299 // system capabilities and the destination of the migration. This is to skip
300 // any subsequent checks required by the optimization, which can be costly.
301 //
302 // The current logic checks that:
303 // - We are in the first pass of the migration (see the explanation of the
304 // two-pass strategy in uvm_migrate).
305 // - The CPU has an NVLINK interconnect to the GPUs. Otherwise, we don't
306 // need this optimization since we are already limited by PCIe BW.
// - The migration spans several VA blocks; otherwise we skip the preunmap to
// avoid its overhead.
309 static bool migration_should_do_cpu_preunmap(uvm_va_space_t *va_space,
310                                              uvm_migrate_pass_t pass,
311                                              bool is_single_block)
312 
313 {
314     if (!g_uvm_perf_migrate_cpu_preunmap_enable)
315         return false;
316 
317     if (pass != UVM_MIGRATE_PASS_FIRST || is_single_block)
318         return false;
319 
320     if (uvm_processor_mask_get_gpu_count(&va_space->has_nvlink[UVM_ID_CPU_VALUE]) == 0)
321         return false;
322 
323     return true;
324 }
325 
326 // This function determines if the VA range properties avoid the need to remove
327 // CPU mappings on UvmMigrate. Currently, it only checks whether
328 // read-duplication is enabled in the VA range. This is because, when migrating
329 // read-duplicated VA blocks, the source processor doesn't need to be unmapped
330 // (though it may need write access revoked).
331 static bool va_range_should_do_cpu_preunmap(const uvm_va_policy_t *policy,
332                                             uvm_va_space_t *va_space)
333 {
334     return !uvm_va_policy_is_read_duplicate(policy, va_space);
335 }
336 
// Determines whether the VA block to be migrated contains pages with CPU
// mappings that don't need to be removed (see the comment above). In that case
// false is returned. Otherwise it returns true and stores in *num_unmap_pages
// the number of pages that do need their CPU mappings removed.
342 static bool va_block_should_do_cpu_preunmap(uvm_va_block_t *va_block,
343                                             uvm_va_block_context_t *va_block_context,
344                                             NvU64 start,
345                                             NvU64 end,
346                                             uvm_processor_id_t dest_id,
347                                             NvU32 *num_unmap_pages)
348 {
349     const uvm_page_mask_t *mapped_pages_cpu;
350     NvU32 num_cpu_unchanged_pages = 0;
351     uvm_va_block_region_t region;
352 
353     *num_unmap_pages = 0;
354 
355     if (!va_block)
356         return true;
357 
358     UVM_ASSERT(va_range_should_do_cpu_preunmap(va_block_context->policy, uvm_va_block_get_va_space(va_block)));
359 
360     region = uvm_va_block_region_from_start_end(va_block, max(start, va_block->start), min(end, va_block->end));
361 
362     uvm_mutex_lock(&va_block->lock);
363 
364     mapped_pages_cpu = uvm_va_block_map_mask_get(va_block, UVM_ID_CPU);
365     if (uvm_processor_mask_test(&va_block->resident, dest_id)) {
366         const uvm_page_mask_t *resident_pages_dest = uvm_va_block_resident_mask_get(va_block, dest_id);
367         uvm_page_mask_t *do_not_unmap_pages = &va_block_context->scratch_page_mask;
368 
369         // TODO: Bug 1877578
370         //
371         // We assume that if pages are mapped on the CPU and not resident on
372         // the destination, the pages will change residency so the CPU must be
373         // unmapped. If we implement automatic read-duplication heuristics in
374         // the future, we'll also need to check if the pages are being
375         // read-duplicated.
376         uvm_page_mask_and(do_not_unmap_pages, mapped_pages_cpu, resident_pages_dest);
377 
378         num_cpu_unchanged_pages = uvm_page_mask_region_weight(do_not_unmap_pages, region);
379     }
380 
381     *num_unmap_pages = uvm_page_mask_region_weight(mapped_pages_cpu, region) - num_cpu_unchanged_pages;
382 
383     uvm_mutex_unlock(&va_block->lock);
384 
385     return num_cpu_unchanged_pages == 0;
386 }
387 
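// Remove the CPU mappings for [start, end] with a single unmap_mapping_range()
// call, but only if every VA block in the range reports that its CPU mappings
// need to be removed (see va_block_should_do_cpu_preunmap() above). Otherwise,
// no unmap is performed.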
388 static void preunmap_multi_block(uvm_va_range_t *va_range,
389                                  uvm_va_block_context_t *va_block_context,
390                                  NvU64 start,
391                                  NvU64 end,
392                                  uvm_processor_id_t dest_id)
393 {
394     size_t i;
395     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
396     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
397     NvU32 num_unmap_pages = 0;
398 
399     UVM_ASSERT(start >= va_range->node.start);
400     UVM_ASSERT(end  <= va_range->node.end);
401     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
402     uvm_assert_rwsem_locked(&va_range->va_space->lock);
403 
404     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
405 
406     for (i = first_block_index; i <= last_block_index; i++) {
407         NvU32 num_block_unmap_pages;
408 
409         if (!va_block_should_do_cpu_preunmap(uvm_va_range_block(va_range, i),
410                                              va_block_context,
411                                              start,
412                                              end,
413                                              dest_id,
414                                              &num_block_unmap_pages)) {
415             return;
416         }
417 
418         num_unmap_pages += num_block_unmap_pages;
419     }
420 
421     if (num_unmap_pages > 0)
422         unmap_mapping_range(va_range->va_space->mapping, start, end - start + 1, 1);
423 }
424 
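// Migrate the [start, end] portion of va_range block by block, creating any
// missing VA blocks along the way.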
425 static NV_STATUS uvm_va_range_migrate_multi_block(uvm_va_range_t *va_range,
426                                                   uvm_va_block_context_t *va_block_context,
427                                                   NvU64 start,
428                                                   NvU64 end,
429                                                   uvm_processor_id_t dest_id,
430                                                   uvm_migrate_mode_t mode,
431                                                   uvm_tracker_t *out_tracker)
432 {
433     size_t i;
434     const size_t first_block_index = uvm_va_range_block_index(va_range, start);
435     const size_t last_block_index = uvm_va_range_block_index(va_range, end);
436 
437     UVM_ASSERT(start >= va_range->node.start);
438     UVM_ASSERT(end  <= va_range->node.end);
439     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
440     uvm_assert_rwsem_locked(&va_range->va_space->lock);
441 
442     UVM_ASSERT(uvm_range_group_all_migratable(va_range->va_space, start, end));
443 
444     // Iterate over blocks, populating them if necessary
445     for (i = first_block_index; i <= last_block_index; i++) {
446         uvm_va_block_retry_t va_block_retry;
447         uvm_va_block_region_t region;
448         uvm_va_block_t *va_block;
449         NV_STATUS status = uvm_va_range_block_create(va_range, i, &va_block);
450 
451         if (status != NV_OK)
452             return status;
453 
454         region = uvm_va_block_region_from_start_end(va_block,
455                                                     max(start, va_block->start),
456                                                     min(end, va_block->end));
457 
458         status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
459                                          uvm_va_block_migrate_locked(va_block,
460                                                                      &va_block_retry,
461                                                                      va_block_context,
462                                                                      region,
463                                                                      dest_id,
464                                                                      mode,
465                                                                      out_tracker));
466         if (status != NV_OK)
467             return status;
468     }
469 
470     return NV_OK;
471 }
472 
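// Migrate the [start, end] portion of va_range. When CPU pre-unmapping is
// enabled, the interval is processed in groups of contiguous VA blocks of
// g_uvm_perf_migrate_cpu_preunmap_size bytes, and each group is pre-unmapped
// from the CPU before its blocks are migrated.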
473 static NV_STATUS uvm_va_range_migrate(uvm_va_range_t *va_range,
474                                       uvm_va_block_context_t *va_block_context,
475                                       NvU64 start,
476                                       NvU64 end,
477                                       uvm_processor_id_t dest_id,
478                                       uvm_migrate_mode_t mode,
479                                       bool should_do_cpu_preunmap,
480                                       uvm_tracker_t *out_tracker)
481 {
482     NvU64 preunmap_range_start = start;
483 
484     UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_range));
485 
486     should_do_cpu_preunmap = should_do_cpu_preunmap && va_range_should_do_cpu_preunmap(va_block_context->policy,
487                                                                                        va_range->va_space);
488 
489     // Divide migrations into groups of contiguous VA blocks. This is to trigger
490     // CPU unmaps for that region before the migration starts.
491     while (preunmap_range_start < end) {
492         NV_STATUS status;
493         NvU64 preunmap_range_end;
494 
495         if (should_do_cpu_preunmap) {
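            // Compute the inclusive end of the current pre-unmap group by
            // aligning up to the next group boundary and clamping to the
            // overall end. For example, assuming an 8MB group size, a start of
            // 0x200000 yields a group end of 0x7fffff.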
496             preunmap_range_end = UVM_ALIGN_UP(preunmap_range_start + 1, g_uvm_perf_migrate_cpu_preunmap_size);
497             preunmap_range_end = min(preunmap_range_end - 1, end);
498 
499             preunmap_multi_block(va_range,
500                                  va_block_context,
501                                  preunmap_range_start,
502                                  preunmap_range_end,
503                                  dest_id);
504         }
505         else {
506             preunmap_range_end = end;
507         }
508 
509         status = uvm_va_range_migrate_multi_block(va_range,
510                                                   va_block_context,
511                                                   preunmap_range_start,
512                                                   preunmap_range_end,
513                                                   dest_id,
514                                                   mode,
515                                                   out_tracker);
516         if (status != NV_OK)
517             return status;
518 
519         preunmap_range_start = preunmap_range_end + 1;
520     }
521 
522     return NV_OK;
523 }
524 
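// Migrate [base, base + length - 1] across all the VA ranges it spans. Returns
// NV_ERR_INVALID_ADDRESS if the interval contains gaps or non-managed ranges,
// and NV_WARN_MORE_PROCESSING_REQUIRED if non-migratable pages had to be
// skipped. For HMM (no first_va_range), the migration is handed off to
// uvm_hmm_migrate_ranges().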
525 static NV_STATUS uvm_migrate_ranges(uvm_va_space_t *va_space,
526                                     uvm_va_block_context_t *va_block_context,
527                                     uvm_va_range_t *first_va_range,
528                                     NvU64 base,
529                                     NvU64 length,
530                                     uvm_processor_id_t dest_id,
531                                     uvm_migrate_mode_t mode,
532                                     bool should_do_cpu_preunmap,
533                                     uvm_tracker_t *out_tracker)
534 {
535     uvm_va_range_t *va_range, *va_range_last;
536     NvU64 end = base + length - 1;
537     NV_STATUS status = NV_OK;
538     bool skipped_migrate = false;
539 
540     if (!first_va_range) {
541         // For HMM, we iterate over va_blocks since there is no va_range.
542         return uvm_hmm_migrate_ranges(va_space,
543                                       va_block_context,
544                                       base,
545                                       length,
546                                       dest_id,
547                                       mode,
548                                       out_tracker);
549     }
550 
551     UVM_ASSERT(first_va_range == uvm_va_space_iter_first(va_space, base, base));
552 
553     va_range_last = NULL;
554     uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) {
555         uvm_range_group_range_iter_t iter;
556         va_range_last = va_range;
557 
558         // Only managed ranges can be migrated
559         if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
560             status = NV_ERR_INVALID_ADDRESS;
561             break;
562         }
563 
564         va_block_context->policy = uvm_va_range_get_policy(va_range);
565 
566         // For UVM-Lite GPUs, the CUDA driver may suballocate a single va_range
        // into many range groups. For this reason, we iterate over each
        // va_range first, then through the range groups within.
569         uvm_range_group_for_each_migratability_in(&iter,
570                                                   va_space,
571                                                   max(base, va_range->node.start),
572                                                   min(end, va_range->node.end)) {
573             // Skip non-migratable VA ranges
574             if (!iter.migratable) {
575                 // Only return NV_WARN_MORE_PROCESSING_REQUIRED if the pages aren't
576                 // already resident at dest_id.
577                 if (!uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, dest_id))
578                     skipped_migrate = true;
579             }
580             else if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, dest_id) &&
581                      !uvm_id_equal(dest_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
582                 // Don't migrate to a non-faultable GPU that is in UVM-Lite mode,
583                 // unless it's the preferred location
584                 status = NV_ERR_INVALID_DEVICE;
585                 break;
586             }
587             else {
588                 status = uvm_va_range_migrate(va_range,
589                                               va_block_context,
590                                               iter.start,
591                                               iter.end,
592                                               dest_id,
593                                               mode,
594                                               should_do_cpu_preunmap,
595                                               out_tracker);
596                 if (status != NV_OK)
597                     break;
598             }
599         }
600     }
601 
602     if (status != NV_OK)
603         return status;
604 
605     // Check that we were able to iterate over the entire range without any gaps
606     if (!va_range_last || va_range_last->node.end < end)
607         return NV_ERR_INVALID_ADDRESS;
608 
609     if (skipped_migrate)
610         return NV_WARN_MORE_PROCESSING_REQUIRED;
611 
612     return NV_OK;
613 }
614 
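// Top-level helper for migrating [base, base + length - 1] to dest_id. It
// allocates a VA block context, selects the one-pass or two-pass strategy
// described in the comment below, and performs the migration via
// uvm_migrate_ranges().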
615 static NV_STATUS uvm_migrate(uvm_va_space_t *va_space,
616                              struct mm_struct *mm,
617                              NvU64 base,
618                              NvU64 length,
619                              uvm_processor_id_t dest_id,
620                              NvU32 migrate_flags,
621                              uvm_va_range_t *first_va_range,
622                              uvm_tracker_t *out_tracker)
623 {
624     NV_STATUS status = NV_OK;
625     uvm_va_block_context_t *va_block_context;
626     bool do_mappings;
627     bool do_two_passes;
628     bool is_single_block;
629     bool should_do_cpu_preunmap;
630 
631     uvm_assert_rwsem_locked(&va_space->lock);
632 
633     // If the GPU has its memory disabled, just skip the migration and let
634     // faults take care of things.
635     if (!uvm_va_space_processor_has_memory(va_space, dest_id))
636         return NV_OK;
637 
638     if (mm)
639         uvm_assert_mmap_lock_locked(mm);
640 
641     va_block_context = uvm_va_block_context_alloc(mm);
642     if (!va_block_context)
643         return NV_ERR_NO_MEMORY;
644 
645     // We perform two passes (unless the migration only covers a single VA
646     // block or UVM_MIGRATE_FLAG_SKIP_CPU_MAP is passed). This helps in the
647     // following scenarios:
648     //
649     // - Migrations that add CPU mappings, since they are synchronous operations
650     // that delay the migration of the next VA blocks.
651     // - Concurrent migrations. This is due to our current channel selection
652     // logic that doesn't prevent false dependencies between independent
    // operations. For example, the removal of mappings for outgoing transfers
    // is delayed by the mappings added by incoming transfers.
655     // TODO: Bug 1764953: Re-evaluate the two-pass logic when channel selection
656     // is overhauled.
657     //
658     // The two passes are as follows:
659     //
660     // 1- Transfer all VA blocks (do not add mappings)
661     // 2- Go block by block reexecuting the transfer (in case someone moved it
662     // since the first pass), and adding the mappings.
663     //
664     // For HMM (!first_va_range), we always do a single pass.
665     is_single_block = !first_va_range || is_migration_single_block(first_va_range, base, length);
666     do_mappings = UVM_ID_IS_GPU(dest_id) || !(migrate_flags & UVM_MIGRATE_FLAG_SKIP_CPU_MAP);
667     do_two_passes = do_mappings && !is_single_block;
668 
669     if (do_two_passes) {
670         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, UVM_MIGRATE_PASS_FIRST, is_single_block);
671 
672         status = uvm_migrate_ranges(va_space,
673                                     va_block_context,
674                                     first_va_range,
675                                     base,
676                                     length,
677                                     dest_id,
678                                     UVM_MIGRATE_MODE_MAKE_RESIDENT,
679                                     should_do_cpu_preunmap,
680                                     out_tracker);
681     }
682 
683     if (status == NV_OK) {
684         uvm_migrate_mode_t mode = do_mappings? UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP:
685                                                UVM_MIGRATE_MODE_MAKE_RESIDENT;
686         uvm_migrate_pass_t pass = do_two_passes? UVM_MIGRATE_PASS_SECOND:
687                                                  UVM_MIGRATE_PASS_FIRST;
688         should_do_cpu_preunmap = migration_should_do_cpu_preunmap(va_space, pass, is_single_block);
689 
690         status = uvm_migrate_ranges(va_space,
691                                     va_block_context,
692                                     first_va_range,
693                                     base,
694                                     length,
695                                     dest_id,
696                                     mode,
697                                     should_do_cpu_preunmap,
698                                     out_tracker);
699     }
700 
701     uvm_va_block_context_free(va_block_context);
702 
703     return status;
704 }
705 
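// Push a GPU semaphore release that writes semaphore_payload to the user
// semaphore at semaphore_user_addr, ordered after the work in
// release_after_tracker.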
706 static NV_STATUS semaphore_release_from_gpu(uvm_gpu_t *gpu,
707                                             uvm_va_range_semaphore_pool_t *semaphore_va_range,
708                                             NvU64 semaphore_user_addr,
709                                             NvU32 semaphore_payload,
710                                             uvm_tracker_t *release_after_tracker)
711 {
712     NV_STATUS status;
713     uvm_push_t push;
714     uvm_channel_type_t channel_type;
715     NvU64 semaphore_gpu_va;
716     NvU64 semaphore_offset;
717 
718     UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(semaphore_va_range->mem, gpu));
719 
720     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_va_range->mem->user->addr;
721     semaphore_gpu_va = uvm_mem_get_gpu_va_kernel(semaphore_va_range->mem, gpu) + semaphore_offset;
722 
723     // Outside of SR-IOV heavy, using UVM_CHANNEL_TYPE_MEMOPS is optimal from a
724     // performance standpoint because if the migration is targeting a GPU, it is
725     // likely that the channel used for the GPU page table update (pushed to
    // UVM_CHANNEL_TYPE_MEMOPS) will also be used for the release. Using a
    // single channel avoids an inter-channel dependency that could otherwise
    // add significant overhead to the enclosing migration.
729     //
730     // In SR-IOV heavy, the user semaphore release is functionally forbidden
731     // from being pushed to a UVM_CHANNEL_TYPE_MEMOPS channel, because it is not
732     // a page tree operation.
733     if (uvm_gpu_is_virt_mode_sriov_heavy(gpu))
734         channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
735     else
736         channel_type = UVM_CHANNEL_TYPE_MEMOPS;
737 
738     status = uvm_push_begin_acquire(gpu->channel_manager,
739                                     channel_type,
740                                     release_after_tracker,
741                                     &push,
742                                     "Pushing semaphore release (*0x%llx = %u)",
743                                     semaphore_user_addr,
744                                     semaphore_payload);
745     if (status != NV_OK)
746         return status;
747 
748     gpu->parent->ce_hal->semaphore_release(&push, semaphore_gpu_va, semaphore_payload);
749     uvm_push_end(&push);
750 
751     uvm_mutex_lock(&semaphore_va_range->tracker_lock);
752     status = uvm_tracker_add_push_safe(&semaphore_va_range->tracker, &push);
753     uvm_tracker_remove_completed(&semaphore_va_range->tracker);
754     uvm_mutex_unlock(&semaphore_va_range->tracker_lock);
755 
756     return status;
757 }
758 
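// Write semaphore_payload directly to the user semaphore through the UVM
// kernel CPU mapping of the semaphore pool memory.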
759 static void semaphore_release_from_cpu(uvm_mem_t *semaphore_mem, NvU64 semaphore_user_addr, NvU32 semaphore_payload)
760 {
761     char *semaphore_cpu_va;
762     NvU64 semaphore_offset;
763 
764     UVM_ASSERT(uvm_mem_mapped_on_cpu_kernel(semaphore_mem));
765 
766     semaphore_offset = semaphore_user_addr - (NvU64)(uintptr_t)semaphore_mem->user->addr;
767 
768     // Prevent processor speculation prior to accessing user-mapped memory to
769     // avoid leaking information from side-channel attacks. Under speculation, a
770     // valid VA range which does not contain this semaphore could be used by the
771     // caller. It's unclear but likely that the user might be able to control
772     // the data at that address. Auditing all potential ways that could happen
773     // is difficult and error-prone, so to be on the safe side we'll just always
774     // block speculation.
775     nv_speculation_barrier();
776 
777     semaphore_cpu_va = (char *) uvm_mem_get_cpu_addr_kernel(semaphore_mem) + semaphore_offset;
778 
779     UVM_WRITE_ONCE(*(NvU32 *)semaphore_cpu_va, semaphore_payload);
780 }
781 
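// Release the user semaphore once the work in tracker_ptr completes. The
// release is pushed from the pool's GPU owner if there is one, performed
// immediately from the CPU if the tracker has already completed, and otherwise
// pushed from dest_gpu (or, when the destination is the CPU, from any GPU that
// has the semaphore mapped).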
782 static NV_STATUS semaphore_release(NvU64 semaphore_address,
783                                    NvU32 semaphore_payload,
784                                    uvm_va_range_semaphore_pool_t *semaphore_pool,
785                                    uvm_gpu_t *dest_gpu,
786                                    uvm_tracker_t *tracker_ptr)
787 {
788     uvm_gpu_t *gpu;
789     uvm_gpu_t *gpu_owner = semaphore_pool->owner;
790 
791     // If there is a GPU owner, release the semaphore from it.
792     if (gpu_owner != NULL)
793         return semaphore_release_from_gpu(gpu_owner, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
794 
795     // Attempt eager release from CPU if the tracker is already completed.
796     if (uvm_tracker_is_completed(tracker_ptr)) {
797         semaphore_release_from_cpu(semaphore_pool->mem, semaphore_address, semaphore_payload);
798         return NV_OK;
799     }
800 
801     if (dest_gpu == NULL) {
802         // The destination is the CPU, but we didn't do a CPU release above
803         // because the previous work is not complete. This situation arises when
804         // accessed_by mappings are being set up asynchronously, or the
805         // test-only flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP is used. So there should
806         // be a registered GPU, since all CPU work is synchronous, and the
807         // semaphore must be mapped on that GPU.
808         //
809         // Note that the GPU selected for the release may not be the same device
810         // that prevented the tracker from being complete.
811         gpu = uvm_global_processor_mask_find_first_gpu(&semaphore_pool->mem->kernel.mapped_on);
812 
813         UVM_ASSERT(gpu != NULL);
814     }
815     else {
816         gpu = dest_gpu;
817     }
818 
819     return semaphore_release_from_gpu(gpu, semaphore_pool, semaphore_address, semaphore_payload, tracker_ptr);
820 }
821 
822 NV_STATUS uvm_migrate_init(void)
823 {
824     NV_STATUS status = uvm_migrate_pageable_init();
825     if (status != NV_OK)
826         return status;
827 
828     g_uvm_perf_migrate_cpu_preunmap_enable = uvm_perf_migrate_cpu_preunmap_enable != 0;
829 
830     BUILD_BUG_ON((UVM_VA_BLOCK_SIZE) & (UVM_VA_BLOCK_SIZE - 1));
831 
832     if (g_uvm_perf_migrate_cpu_preunmap_enable) {
833         if (uvm_perf_migrate_cpu_preunmap_block_order <= UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_MAX) {
834             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << uvm_perf_migrate_cpu_preunmap_block_order;
835         }
836         else {
837             g_uvm_perf_migrate_cpu_preunmap_size = UVM_VA_BLOCK_SIZE << UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT;
838 
839             pr_info("Invalid value %u for uvm_perf_migrate_cpu_preunmap_block_order. Using %u instead\n",
840                     uvm_perf_migrate_cpu_preunmap_block_order,
841                     UVM_PERF_MIGRATE_CPU_PREUNMAP_BLOCK_ORDER_DEFAULT);
842         }
843     }
844 
845     return NV_OK;
846 }
847 
848 void uvm_migrate_exit(void)
849 {
850     uvm_migrate_pageable_exit();
851 }
852 
853 NV_STATUS uvm_api_migrate(UVM_MIGRATE_PARAMS *params, struct file *filp)
854 {
855     uvm_va_space_t *va_space = uvm_va_space_get(filp);
856     uvm_tracker_t tracker = UVM_TRACKER_INIT();
857     uvm_tracker_t *tracker_ptr = NULL;
858     uvm_gpu_t *dest_gpu = NULL;
859     uvm_va_range_t *sema_va_range = NULL;
860     struct mm_struct *mm;
861     NV_STATUS status = NV_OK;
862     bool flush_events = false;
863     const bool synchronous = !(params->flags & UVM_MIGRATE_FLAG_ASYNC);
864 
865     // We temporarily allow 0 length in the IOCTL parameters as a signal to
866     // only release the semaphore. This is because user-space is in charge of
867     // migrating pageable memory in some cases.
868     //
869     // TODO: Bug 2419180: do not allow 0 length migrations when we fully switch
870     // to migrate_vma for all types of vmas.
871     if (params->length > 0 || synchronous || params->semaphoreAddress == 0) {
872         if (uvm_api_range_invalid(params->base, params->length))
873             return NV_ERR_INVALID_ADDRESS;
874     }
875 
876     if (params->flags & ~UVM_MIGRATE_FLAGS_ALL)
877         return NV_ERR_INVALID_ARGUMENT;
878 
879     if ((params->flags & UVM_MIGRATE_FLAGS_TEST_ALL) && !uvm_enable_builtin_tests) {
880         UVM_INFO_PRINT("Test flag set for UVM_MIGRATE. Did you mean to insmod with uvm_enable_builtin_tests=1?\n");
882         return NV_ERR_INVALID_ARGUMENT;
883     }
884 
885     // mmap_lock will be needed if we have to create CPU mappings
886     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
887     uvm_va_space_down_read(va_space);
888 
889     if (synchronous) {
890         if (params->semaphoreAddress != 0) {
891             status = NV_ERR_INVALID_ARGUMENT;
892             goto done;
893         }
894     }
895     else {
896         if (params->semaphoreAddress == 0) {
897             if (params->semaphorePayload != 0) {
898                 status = NV_ERR_INVALID_ARGUMENT;
899                 goto done;
900             }
901         }
902         else {
903             sema_va_range = uvm_va_range_find(va_space, params->semaphoreAddress);
904             if (!IS_ALIGNED(params->semaphoreAddress, sizeof(params->semaphorePayload)) ||
905                     !sema_va_range || sema_va_range->type != UVM_VA_RANGE_TYPE_SEMAPHORE_POOL) {
906                 status = NV_ERR_INVALID_ADDRESS;
907                 goto done;
908             }
909         }
910     }
911 
912     if (!uvm_uuid_is_cpu(&params->destinationUuid)) {
913         if (params->flags & UVM_MIGRATE_FLAG_NO_GPU_VA_SPACE)
914             dest_gpu = uvm_va_space_get_gpu_by_uuid(va_space, &params->destinationUuid);
915         else
916             dest_gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
917 
918         if (!dest_gpu) {
919             status = NV_ERR_INVALID_DEVICE;
920             goto done;
921         }
922 
923         if (params->length > 0 && !uvm_gpu_can_address(dest_gpu, params->base, params->length)) {
924             status = NV_ERR_OUT_OF_RANGE;
925             goto done;
926         }
927     }
928 
929     UVM_ASSERT(status == NV_OK);
930 
931     // If we're synchronous or if we need to release a semaphore, use a tracker.
932     if (synchronous || params->semaphoreAddress)
933         tracker_ptr = &tracker;
934 
935     if (params->length > 0) {
936         uvm_api_range_type_t type;
937 
938         type = uvm_api_range_type_check(va_space, mm, params->base, params->length);
939         if (type == UVM_API_RANGE_TYPE_INVALID) {
940             status = NV_ERR_INVALID_ADDRESS;
941             goto done;
942         }
943 
944         if (type == UVM_API_RANGE_TYPE_ATS) {
945             uvm_migrate_args_t uvm_migrate_args =
946             {
947                 .va_space               = va_space,
948                 .mm                     = mm,
949                 .start                  = params->base,
950                 .length                 = params->length,
951                 .dst_id                 = (dest_gpu ? dest_gpu->id : UVM_ID_CPU),
952                 .dst_node_id            = (int)params->cpuNumaNode,
953                 .populate_permissions   = UVM_POPULATE_PERMISSIONS_INHERIT,
954                 .touch                  = false,
955                 .skip_mapped            = false,
956                 .user_space_start       = &params->userSpaceStart,
957                 .user_space_length      = &params->userSpaceLength,
958             };
959 
960             status = uvm_migrate_pageable(&uvm_migrate_args);
961         }
962         else {
963             status = uvm_migrate(va_space,
964                                  mm,
965                                  params->base,
966                                  params->length,
967                                  (dest_gpu ? dest_gpu->id : UVM_ID_CPU),
968                                  params->flags,
969                                  uvm_va_space_iter_first(va_space,
970                                                          params->base,
971                                                          params->base),
972                                  tracker_ptr);
973         }
974     }
975 
976 done:
977     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
978     // we need to wait for the tracker to finish.
979     //
980     // TODO: Bug 1766650: For large migrations with destination CPU, try
981     //       benchmarks to see if a two-pass approach would be faster (first
982     //       pass pushes all GPU work asynchronously, second pass updates CPU
983     //       mappings synchronously).
984     if (mm)
985         uvm_up_read_mmap_lock_out_of_order(mm);
986 
987     if (tracker_ptr) {
988         // If requested, release semaphore
989         if (params->semaphoreAddress && (status == NV_OK)) {
990             status = semaphore_release(params->semaphoreAddress,
991                                        params->semaphorePayload,
992                                        &sema_va_range->semaphore_pool,
993                                        dest_gpu,
994                                        tracker_ptr);
995         }
996 
997         // Wait on the tracker if we are synchronous or there was an error. The
998         // VA space lock must be held to prevent GPUs from being unregistered.
999         if (synchronous || (status != NV_OK)) {
1000             NV_STATUS tracker_status = uvm_tracker_wait(tracker_ptr);
1001 
1002             // Only clobber status if we didn't hit an earlier error
1003             if (status == NV_OK)
1004                 status = tracker_status;
1005 
1006             flush_events = true;
1007         }
1008 
1009         uvm_tracker_deinit(tracker_ptr);
1010     }
1011 
1012     uvm_va_space_up_read(va_space);
1013     uvm_va_space_mm_or_current_release(va_space, mm);
1014 
1015     // If the migration is known to be complete, eagerly dispatch the migration
1016     // events, instead of processing them on a later event flush. Note that an
1017     // asynchronous migration could be complete by now, but the flush would not
1018     // be triggered.
1019     if (flush_events)
1020         uvm_tools_flush_events();
1021 
1022     return status;
1023 }
1024 
1025 NV_STATUS uvm_api_migrate_range_group(UVM_MIGRATE_RANGE_GROUP_PARAMS *params, struct file *filp)
1026 {
1027     NV_STATUS status = NV_OK;
1028     NV_STATUS tracker_status = NV_OK;
1029     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1030     struct mm_struct *mm;
1031     uvm_range_group_t *range_group;
1032     uvm_range_group_range_t *rgr;
1033     uvm_processor_id_t dest_id;
1034     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1035     NvU32 migrate_flags = 0;
1036     uvm_gpu_t *gpu = NULL;
1037 
1038     // mmap_lock will be needed if we have to create CPU mappings
1039     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1040     uvm_va_space_down_read(va_space);
1041 
1042     if (uvm_uuid_is_cpu(&params->destinationUuid)) {
1043         dest_id = UVM_ID_CPU;
1044     }
1045     else {
1046         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->destinationUuid);
1047         if (!gpu) {
1048             status = NV_ERR_INVALID_DEVICE;
1049             goto done;
1050         }
1051 
1052         dest_id = gpu->id;
1053     }
1054 
1055     range_group = radix_tree_lookup(&va_space->range_groups, params->rangeGroupId);
1056     if (!range_group) {
1057         status = NV_ERR_OBJECT_NOT_FOUND;
1058         goto done;
1059     }
1060 
1061     // Migrate all VA ranges in the range group. uvm_migrate is used because it performs all
1062     // VA range validity checks.
1063     list_for_each_entry(rgr, &range_group->ranges, range_group_list_node) {
1064         NvU64 start = rgr->node.start;
1065         NvU64 length = rgr->node.end - rgr->node.start + 1;
1066 
1067         if (gpu && !uvm_gpu_can_address(gpu, start, length)) {
1068             status = NV_ERR_OUT_OF_RANGE;
1069         }
1070         else {
1071             uvm_va_range_t *first_va_range = uvm_va_space_iter_first(va_space, start, start);
1072 
1073             if (!first_va_range || first_va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
1074                 status = NV_ERR_INVALID_ADDRESS;
1075                 goto done;
1076             }
1077 
1078             status = uvm_migrate(va_space,
1079                                  mm,
1080                                  start,
1081                                  length,
1082                                  dest_id,
1083                                  migrate_flags,
1084                                  first_va_range,
1085                                  &local_tracker);
1086         }
1087 
1088         if (status != NV_OK)
1089             goto done;
1090     }
1091 
1092 done:
1093     // We only need to hold mmap_lock to create new CPU mappings, so drop it if
1094     // we need to wait for the tracker to finish.
1095     //
1096     // TODO: Bug 1766650: For large migrations with destination CPU, try
1097     //       benchmarks to see if a two-pass approach would be faster (first
1098     //       pass pushes all GPU work asynchronously, second pass updates CPU
1099     //       mappings synchronously).
1100     if (mm)
1101         uvm_up_read_mmap_lock_out_of_order(mm);
1102 
1103     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1104     uvm_va_space_up_read(va_space);
1105     uvm_va_space_mm_or_current_release(va_space, mm);
1106 
1107     // This API is synchronous, so wait for migrations to finish
1108     uvm_tools_flush_events();
1109 
1110     return status == NV_OK? tracker_status : status;
1111 }
1112