1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_hmm.h"
25 
26 // Support for HMM ( https://docs.kernel.org/mm/hmm.html ):
27 
28 #ifdef NVCPU_X86_64
29 static bool uvm_disable_hmm = false;
30 MODULE_PARM_DESC(uvm_disable_hmm,
31                  "Force-disable HMM functionality in the UVM driver. "
32                  "Default: false (HMM is enabled if possible). "
33                  "However, even with uvm_disable_hmm=false, HMM will not be "
34                  "enabled if is not supported in this driver build "
35                  "configuration, or if ATS settings conflict with HMM.");
36 #else
37 // So far, we've only tested HMM on x86_64, so disable it by default everywhere
38 // else.
39 static bool uvm_disable_hmm = true;
40 MODULE_PARM_DESC(uvm_disable_hmm,
41                  "Force-disable HMM functionality in the UVM driver. "
42                  "Default: true (HMM is not enabled on this CPU architecture). "
43                  "However, even with uvm_disable_hmm=false, HMM will not be "
44                  "enabled if is not supported in this driver build "
45                  "configuration, or if ATS settings conflict with HMM.");
46 #endif
47 
48 module_param(uvm_disable_hmm, bool, 0444);
49 
50 #if UVM_IS_CONFIG_HMM()
51 
52 #include <linux/hmm.h>
53 #include <linux/rmap.h>
54 #include <linux/migrate.h>
55 #include <linux/userfaultfd_k.h>
56 #include <linux/memremap.h>
57 #include <linux/wait.h>
58 
59 #include "uvm_common.h"
60 #include "uvm_gpu.h"
61 #include "uvm_pmm_gpu.h"
62 #include "uvm_hal_types.h"
63 #include "uvm_va_block_types.h"
64 #include "uvm_va_space_mm.h"
65 #include "uvm_va_space.h"
66 #include "uvm_va_range.h"
67 #include "uvm_range_tree.h"
68 #include "uvm_pmm_sysmem.h"
69 #include "uvm_lock.h"
70 #include "uvm_api.h"
71 #include "uvm_va_policy.h"
72 #include "uvm_tools.h"
73 
74 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
75                                uvm_page_index_t page_index,
76                                struct page *page);
77 
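// Per-call context structures used to pass state between the HMM GPU fault,
// migration, and device-private (devmem) fault servicing helpers in this file.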
78 typedef struct
79 {
80     uvm_processor_id_t processor_id;
81     uvm_processor_id_t new_residency;
82     uvm_va_block_t *va_block;
83     uvm_va_block_retry_t *va_block_retry;
84     uvm_service_block_context_t *service_context;
85     uvm_page_mask_t page_mask;
86     uvm_page_mask_t same_devmem_page_mask;
87 } uvm_hmm_gpu_fault_event_t;
88 
89 typedef struct
90 {
91     uvm_va_block_t *va_block;
92     uvm_va_block_retry_t *va_block_retry;
93     uvm_va_block_context_t *va_block_context;
94     uvm_va_block_region_t region;
95     uvm_processor_id_t dest_id;
96     uvm_make_resident_cause_t cause;
97     uvm_page_mask_t page_mask;
98     uvm_page_mask_t same_devmem_page_mask;
99 } uvm_hmm_migrate_event_t;
100 
101 typedef struct
102 {
103     uvm_processor_id_t processor_id;
104     uvm_va_block_t *va_block;
105     uvm_va_block_retry_t *va_block_retry;
106     uvm_service_block_context_t *service_context;
107     uvm_page_mask_t page_mask;
108     uvm_page_mask_t same_devmem_page_mask;
109 } uvm_hmm_devmem_fault_context_t;
110 
111 bool uvm_hmm_is_enabled_system_wide(void)
112 {
113     if (uvm_disable_hmm)
114         return false;
115 
116     if (g_uvm_global.ats.enabled)
117         return false;
118 
119     // Confidential Computing and HMM impose mutually exclusive constraints. In
120     // Confidential Computing the GPU can only access pages resident in vidmem,
    // but in HMM pages may be required to be resident in sysmem: file-backed
122     // VMAs, huge pages, etc.
123     if (g_uvm_global.conf_computing_enabled)
124         return false;
125 
126     return uvm_va_space_mm_enabled_system();
127 }
128 
129 bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
130 {
131     return uvm_hmm_is_enabled_system_wide() &&
132            uvm_va_space_mm_enabled(va_space) &&
133            !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM);
134 }
135 
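// Return the HMM va_block embedding the given range tree node, or NULL if
// node is NULL.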
136 static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
137 {
138     if (!node)
139         return NULL;
140     return container_of(node, uvm_va_block_t, hmm.node);
141 }
142 
143 void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
144 {
145     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
146 
147     if (!uvm_hmm_is_enabled(va_space))
148         return;
149 
150     uvm_range_tree_init(&hmm_va_space->blocks);
    uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF);
}
155 
156 void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
157 {
158     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
159     uvm_range_tree_node_t *node, *next;
160     uvm_va_block_t *va_block;
161 
162     if (!uvm_hmm_is_enabled(va_space))
163         return;
164 
165     uvm_assert_rwsem_locked_write(&va_space->lock);
166 
167     // The blocks_lock is not needed when the va_space lock is held for write.
168     uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
169         va_block = hmm_va_block_from_node(node);
170         uvm_range_tree_remove(&hmm_va_space->blocks, node);
171         mmu_interval_notifier_remove(&va_block->hmm.notifier);
172         uvm_va_block_kill(va_block);
173     }
174 }
175 
176 static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
177                                         uvm_gpu_t *gpu,
178                                         struct mm_struct *mm)
179 {
180     uvm_va_policy_node_t *node;
181 
182     uvm_mutex_lock(&va_block->lock);
183 
184     // Reset preferred location and accessed-by of policy nodes if needed.
185     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
186         if (uvm_id_equal(node->policy.preferred_location, gpu->id))
187             node->policy.preferred_location = UVM_ID_INVALID;
188 
189         uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
190     }
191 
192     // Migrate and free any remaining resident allocations on this GPU.
193     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
194 
195     uvm_mutex_unlock(&va_block->lock);
196 }
197 
198 void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
199 {
200     uvm_range_tree_node_t *node;
201     uvm_va_block_t *va_block;
202 
203     if (!uvm_hmm_is_enabled(va_space))
204         return;
205 
206     if (mm)
207         uvm_assert_mmap_lock_locked(mm);
208     uvm_assert_rwsem_locked_write(&va_space->lock);
209 
210     uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
211         va_block = hmm_va_block_from_node(node);
212 
213         hmm_va_block_unregister_gpu(va_block, gpu, mm);
214     }
215 }
216 
217 static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
218                                              uvm_gpu_va_space_t *gpu_va_space,
219                                              uvm_va_block_context_t *va_block_context)
220 {
221     uvm_mutex_lock(&va_block->lock);
222 
223     uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
224 
225     uvm_mutex_unlock(&va_block->lock);
226 
227     // TODO: Bug 3660922: Need to handle read duplication at some point.
228     // See range_remove_gpu_va_space_managed().
229 }
230 
231 void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
232                                  uvm_gpu_va_space_t *gpu_va_space,
233                                  struct mm_struct *mm)
234 {
235     uvm_va_block_context_t *va_block_context;
236     uvm_range_tree_node_t *node, *next;
237     uvm_va_block_t *va_block;
238 
239     if (!uvm_hmm_is_enabled(va_space))
240         return;
241 
242     if (mm)
243         uvm_assert_mmap_lock_locked(mm);
244     uvm_assert_rwsem_locked_write(&va_space->lock);
245 
246     va_block_context = uvm_va_space_block_context(va_space, mm);
247 
248     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
249         va_block = hmm_va_block_from_node(node);
250 
251         hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
252     }
253 }
254 
255 static bool hmm_invalidate(uvm_va_block_t *va_block,
256                            const struct mmu_notifier_range *range,
257                            unsigned long cur_seq)
258 {
259     uvm_thread_context_t *uvm_context = uvm_thread_context();
260     struct mmu_interval_notifier *mni = &va_block->hmm.notifier;
261     struct mm_struct *mm = mni->mm;
262     uvm_va_block_context_t *va_block_context;
263     uvm_va_block_region_t region;
264     NvU64 start, end;
265     uvm_processor_id_t id;
266     NV_STATUS status = NV_OK;
267 
268     // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release()
    // doesn't remove the interval notifiers from the mm_struct, so there will
    // be a full-range MMU_NOTIFY_UNMAP event after the release from
271     // unmap_vmas() during exit_mmap().
272     if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE)
273         return true;
274 
275     // Blockable is only set false by
276     // mmu_notifier_invalidate_range_start_nonblock() which is only called in
277     // __oom_reap_task_mm().
278     if (!mmu_notifier_range_blockable(range))
279         return false;
280 
281     // We only ignore invalidations in this context whilst holding the
282     // va_block lock. This prevents deadlock when try_to_migrate()
    // calls the notifier, but holding the lock prevents other threads from
    // invalidating PTEs, so we can safely assume the results of
285     // migrate_vma_setup() are correct.
286     if (uvm_context->ignore_hmm_invalidate_va_block == va_block ||
287         ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) &&
288          range->owner == &g_uvm_global))
289         return true;
290 
291     va_block_context = uvm_va_block_context_alloc(mm);
292     if (!va_block_context)
293         return true;
294 
295     uvm_mutex_lock(&va_block->lock);
296 
297     // mmu_interval_notifier_remove() is always called before marking a
298     // va_block as dead so this va_block has to be alive.
299     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
300 
301     // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
302     // Also note that hmm_invalidate() can be called when a new va_block is not
303     // yet inserted into the va_space->hmm.blocks table while the original
304     // va_block is being split. The original va_block may have its end address
305     // updated before the mmu interval notifier is updated so this invalidate
306     // may be for a range past the va_block end address.
307     start = range->start;
308     end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
309     if (start < va_block->start)
310         start = va_block->start;
311     if (end > va_block->end)
312         end = va_block->end;
313     if (start > end)
314         goto unlock;
315 
316     // These will be equal if no other thread causes an invalidation
    // whilst the va_block lock is dropped.
318     uvm_context->hmm_invalidate_seqnum++;
319     va_block->hmm.changed++;
320 
321     mmu_interval_set_seq(mni, cur_seq);
322 
323     region = uvm_va_block_region_from_start_end(va_block, start, end);
324 
325     va_block_context->hmm.vma = NULL;
326 
327     // We only need to unmap GPUs since Linux handles the CPUs.
328     for_each_gpu_id_in_mask(id, &va_block->mapped) {
329         status = uvm_va_block_unmap(va_block,
330                                     va_block_context,
331                                     id,
332                                     region,
333                                     uvm_va_block_map_mask_get(va_block, id),
334                                     &va_block->tracker);
335         // Note that the va_block lock can be dropped, relocked, and
336         // NV_ERR_MORE_PROCESSING_REQUIRED returned.
337         if (status != NV_OK)
338             break;
339     }
340 
341     if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR)
342         uvm_va_block_munmap_region(va_block, region);
343 
344     if (status == NV_OK)
345         status = uvm_tracker_wait(&va_block->tracker);
346 
347     // Remove stale HMM struct page pointers to system memory.
348     uvm_va_block_remove_cpu_chunks(va_block, region);
349 
350 unlock:
351     uvm_mutex_unlock(&va_block->lock);
352 
353     uvm_va_block_context_free(va_block_context);
354 
355     UVM_ASSERT(status == NV_OK);
356     return true;
357 }
358 
359 static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni,
360                                      const struct mmu_notifier_range *range,
361                                      unsigned long cur_seq)
362 {
363     uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier);
364 
365     UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq));
366 }
367 
368 static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops =
369 {
370     .invalidate = uvm_hmm_invalidate_entry,
371 };
372 
373 NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space,
374                                 NvU64 addr,
375                                 uvm_va_block_t **va_block_ptr)
376 {
377     uvm_range_tree_node_t *node;
378 
379     if (!uvm_hmm_is_enabled(va_space))
380         return NV_ERR_INVALID_ADDRESS;
381 
382     uvm_assert_rwsem_locked(&va_space->lock);
383 
384     uvm_mutex_lock(&va_space->hmm.blocks_lock);
385     node = uvm_range_tree_find(&va_space->hmm.blocks, addr);
386     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
387 
388     if (!node)
389         return NV_ERR_OBJECT_NOT_FOUND;
390 
391     *va_block_ptr = hmm_va_block_from_node(node);
392 
393     return NV_OK;
394 }
395 
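// Wrapper around migrate_vma_setup() that tells this thread's HMM invalidate
// callback to ignore invalidations of va_block while the setup runs.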
396 static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block)
397 {
398     uvm_thread_context_t *uvm_context = uvm_thread_context();
399     int ret;
400 
401     // It's only safe to ignore invalidations whilst doing a migration
402     // and holding the va_block lock.
403     uvm_assert_mutex_locked(&va_block->lock);
404     uvm_context->ignore_hmm_invalidate_va_block = va_block;
405     ret = migrate_vma_setup(args);
406 
407     // We shouldn't be generating any more invalidations now.
408     uvm_context->ignore_hmm_invalidate_va_block = NULL;
409     return ret;
410 }
411 
412 static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma,
413                                  unsigned long addr,
414                                  bool allow_unreadable_vma)
415 {
416     // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support
417     // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ
418     // but we allow those VMAs to have policy set on them.
419     // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled
420     // by uvm_hmm_must_use_sysmem() forcing residency to the CPU.
421     return vma &&
422            addr >= vma->vm_start &&
423            !userfaultfd_armed(vma) &&
424            !(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
425            !uvm_vma_is_managed(vma) &&
426            (allow_unreadable_vma || (vma->vm_flags & VM_READ));
427 }
428 
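// Initialize the HMM-specific state of a newly created va_block covering
// [start, end].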
429 static void hmm_va_block_init(uvm_va_block_t *va_block,
430                               uvm_va_space_t *va_space,
431                               NvU64 start,
432                               NvU64 end)
433 {
434     va_block->hmm.va_space = va_space;
435     va_block->hmm.node.start = start;
436     va_block->hmm.node.end = end;
437     uvm_range_tree_init(&va_block->hmm.va_policy_tree);
438     uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
439 }
440 
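// Find the HMM va_block covering addr, creating it if it doesn't already
// exist. A new block covers the largest interval around addr that is
// UVM_VA_BLOCK_SIZE aligned and doesn't overlap existing UVM va_ranges or
// HMM va_blocks.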
441 static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
442                                           NvU64 addr,
443                                           bool allow_unreadable_vma,
444                                           struct vm_area_struct **vma_out,
445                                           uvm_va_block_t **va_block_ptr)
446 {
447     struct mm_struct *mm;
448     struct vm_area_struct *va_block_vma;
449     uvm_va_block_t *va_block;
450     NvU64 start, end;
451     NV_STATUS status;
452     int ret;
453 
454     if (!uvm_hmm_is_enabled(va_space))
455         return NV_ERR_INVALID_ADDRESS;
456 
457     mm = va_space->va_space_mm.mm;
458     uvm_assert_mmap_lock_locked(mm);
459     uvm_assert_rwsem_locked(&va_space->lock);
460     UVM_ASSERT(PAGE_ALIGNED(addr));
461 
462     // Note that we have to allow PROT_NONE VMAs so that policies can be set.
463     va_block_vma = find_vma(mm, addr);
464     if (!uvm_hmm_vma_is_valid(va_block_vma, addr, allow_unreadable_vma))
465         return NV_ERR_INVALID_ADDRESS;
466 
467     // Since we only hold the va_space read lock, there can be multiple
468     // parallel va_block insertions.
469     uvm_mutex_lock(&va_space->hmm.blocks_lock);
470 
471     va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr));
472     if (va_block)
473         goto done;
474 
475     // The va_block is always created to cover the whole aligned
476     // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or
477     // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it
478     // doesn't overlap.
479     start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
480     end = start + UVM_VA_BLOCK_SIZE - 1;
481 
482     // Search for existing UVM va_ranges in the start/end interval and create
483     // a maximum interval that doesn't overlap any existing UVM va_ranges.
484     // We know that 'addr' is not within a va_range or
485     // hmm_va_block_find_create() wouldn't be called.
486     status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
487     UVM_ASSERT(status == NV_OK);
488 
489     // Search for existing HMM va_blocks in the start/end interval and create
490     // a maximum interval that doesn't overlap any existing HMM va_blocks.
491     status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
492     UVM_ASSERT(status == NV_OK);
493 
494     // Create a HMM va_block with a NULL va_range pointer.
495     status = uvm_va_block_create(NULL, start, end, &va_block);
496     if (status != NV_OK)
497         goto err_unlock;
498 
499     hmm_va_block_init(va_block, va_space, start, end);
500 
501     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
502                                        mm,
503                                        start,
504                                        end - start + 1,
505                                        &uvm_hmm_notifier_ops);
506     if (ret) {
507         status = errno_to_nv_status(ret);
508         goto err_release;
509     }
510 
511     status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
512     UVM_ASSERT(status == NV_OK);
513 
514 done:
515     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
516     if (vma_out)
517         *vma_out = va_block_vma;
518     *va_block_ptr = va_block;
519     return NV_OK;
520 
521 err_release:
522     uvm_va_block_release(va_block);
523 
524 err_unlock:
525     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
526     return status;
527 }
528 
529 NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
530                                        NvU64 addr,
531                                        struct vm_area_struct **vma,
532                                        uvm_va_block_t **va_block_ptr)
533 {
534     return hmm_va_block_find_create(va_space, addr, false, vma, va_block_ptr);
535 }
536 
537 NV_STATUS uvm_hmm_find_vma(struct mm_struct *mm, struct vm_area_struct **vma_out, NvU64 addr)
538 {
539     if (!mm)
540         return NV_ERR_INVALID_ADDRESS;
541 
542     uvm_assert_mmap_lock_locked(mm);
543 
544     *vma_out = find_vma(mm, addr);
545     if (!uvm_hmm_vma_is_valid(*vma_out, addr, false))
546         return NV_ERR_INVALID_ADDRESS;
547 
548     return NV_OK;
549 }
550 
551 bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
552                                         struct vm_area_struct *vma,
553                                         uvm_va_block_region_t region)
554 {
555     uvm_assert_mutex_locked(&va_block->lock);
556 
557     if (uvm_va_block_is_hmm(va_block)) {
558         UVM_ASSERT(vma);
559         UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm);
560         uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
561         UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
562         UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
563     }
564 
565     return true;
566 }
567 
568 void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
569 {
570     // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
571     service_context->block_context.hmm.swap_cached = false;
572 }
573 
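// Attempt to take the va_block's migrate lock without blocking. Returns
// NV_ERR_BUSY_RETRY if a migration is already in progress on this block.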
574 NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
575 {
576     if (uvm_mutex_trylock(&va_block->hmm.migrate_lock))
577         return NV_OK;
578 
579     return NV_ERR_BUSY_RETRY;
580 }
581 
582 void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
583 {
584     uvm_mutex_lock(&va_block->hmm.migrate_lock);
585 }
586 
587 void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
588 {
589     uvm_mutex_unlock(&va_block->hmm.migrate_lock);
590 }
591 
// Migrate the given range [start, end] within a va_block to dest_id.
593 static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
594                                    uvm_va_block_retry_t *va_block_retry,
595                                    uvm_va_block_context_t *va_block_context,
596                                    uvm_processor_id_t dest_id,
597                                    NvU64 start,
598                                    NvU64 end,
599                                    uvm_migrate_mode_t mode,
600                                    uvm_tracker_t *out_tracker)
601 {
602     uvm_va_block_region_t region;
603     uvm_va_policy_node_t *node;
604     const uvm_va_policy_t *policy;
605     NV_STATUS status = NV_OK;
606 
607     uvm_hmm_migrate_begin_wait(va_block);
608     uvm_mutex_lock(&va_block->lock);
609 
610     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
611         // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
612         // va_block lock, the policy remains valid because we hold the mmap
613         // lock so munmap can't remove the policy, and the va_space lock so the
614         // policy APIs can't change the policy.
615         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
616                                            va_block_retry,
617                                            uvm_va_block_migrate_locked(va_block,
618                                                                        va_block_retry,
619                                                                        va_block_context,
620                                                                        region,
621                                                                        dest_id,
622                                                                        mode,
623                                                                        out_tracker));
624         if (status != NV_OK)
625             break;
626     }
627 
628     uvm_mutex_unlock(&va_block->lock);
629     uvm_hmm_migrate_finish(va_block);
630 
631     return status;
632 }
633 
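// Migrate all HMM va_blocks in the va_space back to the CPU, one valid VMA
// region at a time.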
634 void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
635 {
636     // We can't use uvm_va_space_mm_retain(), because the va_space_mm
637     // should already be dead by now.
638     struct mm_struct *mm = va_space->va_space_mm.mm;
639     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
640     uvm_range_tree_node_t *node, *next;
641     uvm_va_block_t *va_block;
642     uvm_va_block_context_t *block_context;
643 
644     uvm_down_read_mmap_lock(mm);
645     uvm_va_space_down_write(va_space);
646 
647     uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
648         uvm_va_block_region_t region;
649         struct vm_area_struct *vma;
650 
651         va_block = hmm_va_block_from_node(node);
652         block_context = uvm_va_space_block_context(va_space, mm);
653         uvm_hmm_migrate_begin_wait(va_block);
654         uvm_mutex_lock(&va_block->lock);
655         for_each_va_block_vma_region(va_block, mm, vma, &region) {
656             if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
657                 continue;
658 
659             block_context->hmm.vma = vma;
660             uvm_hmm_va_block_migrate_locked(va_block,
661                                             NULL,
662                                             block_context,
663                                             UVM_ID_CPU,
664                                             region,
665                                             UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
666         }
667         uvm_mutex_unlock(&va_block->lock);
668         uvm_hmm_migrate_finish(va_block);
669     }
670 
671     uvm_va_space_up_write(va_space);
672     uvm_up_read_mmap_lock(mm);
673 }
674 
675 NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
676 {
677     uvm_va_block_test_t *block_test;
678     uvm_va_block_t *va_block;
679     NV_STATUS status;
680 
681     if (!uvm_hmm_is_enabled(va_space))
682         return NV_ERR_INVALID_ADDRESS;
683 
684     status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
685     if (status != NV_OK)
686         return status;
687 
688     block_test = uvm_va_block_get_test(va_block);
689     if (block_test)
690         block_test->inject_split_error = true;
691 
692     return NV_OK;
693 }
694 
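// Temporary interval notifier used while splitting a va_block. It covers the
// existing block's shrinking range so invalidations aren't missed while the
// original notifier is removed and re-inserted with its new size (see
// hmm_split_block()).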
695 typedef struct {
696     struct mmu_interval_notifier notifier;
697     uvm_va_block_t *existing_block;
698 } hmm_split_invalidate_data_t;
699 
700 static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
701                                  const struct mmu_notifier_range *range,
702                                  unsigned long cur_seq)
703 {
704     hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);
705 
706     uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
707     hmm_invalidate(split_data->existing_block, range, cur_seq);
708 
709     return true;
710 }
711 
712 static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni,
713                                        const struct mmu_notifier_range *range,
714                                        unsigned long cur_seq)
715 {
716     UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq));
717 }
718 
719 static const struct mmu_interval_notifier_ops hmm_notifier_split_ops =
720 {
721     .invalidate = hmm_split_invalidate_entry,
722 };
723 
724 // Splits existing va_block into two pieces, with new_va_block always after
725 // va_block. va_block is updated to have new_end. new_end+1 must be page-
726 // aligned.
727 //
728 // Before: [----------- existing ------------]
729 // After:  [---- existing ----][---- new ----]
730 //                            ^new_end
731 //
732 // On error, va_block is still accessible and is left in its original
733 // functional state.
734 static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
735                                  NvU64 new_end,
736                                  uvm_va_block_t **new_block_ptr)
737 {
738     uvm_va_space_t *va_space = va_block->hmm.va_space;
739     struct mm_struct *mm = va_space->va_space_mm.mm;
740     hmm_split_invalidate_data_t split_data;
741     NvU64 delay_us;
742     uvm_va_block_t *new_va_block;
743     NV_STATUS status;
744     int ret;
745 
746     uvm_assert_rwsem_locked_write(&va_space->lock);
747 
748     UVM_ASSERT(new_end > va_block->start);
749     UVM_ASSERT(new_end < va_block->end);
750     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
751 
752     status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block);
753     if (status != NV_OK)
754         return status;
755 
756     // Initialize the newly created HMM va_block.
757     hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end);
758 
759     ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
760                                        mm,
761                                        new_va_block->start,
762                                        uvm_va_block_size(new_va_block),
763                                        &uvm_hmm_notifier_ops);
764 
765     // Since __mmu_notifier_register() was called when the va_space was
766     // initially created, we know that mm->notifier_subscriptions is valid
767     // and mmu_interval_notifier_insert() can't return ENOMEM.
768     // The only error return is for start + length overflowing but we already
769     // registered the same address range before so there should be no error.
770     UVM_ASSERT(!ret);
771 
772     uvm_mutex_lock(&va_block->lock);
773 
774     status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
775     if (status != NV_OK)
776         goto err;
777 
778     uvm_mutex_unlock(&va_block->lock);
779 
780     // The MMU interval notifier has to be removed in order to resize it.
781     // That means there would be a window of time when invalidation callbacks
782     // could be missed. To handle this case, we register a temporary notifier
783     // to cover the address range while resizing the old notifier (it is
784     // OK to have multiple notifiers for the same range, we may simply try to
785     // invalidate twice).
786     split_data.existing_block = va_block;
787     ret = mmu_interval_notifier_insert(&split_data.notifier,
788                                        mm,
789                                        va_block->start,
790                                        new_end - va_block->start + 1,
791                                        &hmm_notifier_split_ops);
792     UVM_ASSERT(!ret);
793 
794     // Delay to allow hmm_sanity test to trigger an mmu_notifier during the
795     // critical window where the split invalidate callback is active.
796     delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
797     if (delay_us)
798         udelay(delay_us);
799 
800     mmu_interval_notifier_remove(&va_block->hmm.notifier);
801 
802     // Enable notifications on the old block with the smaller size.
803     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
804                                        mm,
805                                        va_block->start,
806                                        uvm_va_block_size(va_block),
807                                        &uvm_hmm_notifier_ops);
808     UVM_ASSERT(!ret);
809 
810     mmu_interval_notifier_remove(&split_data.notifier);
811 
812     if (new_block_ptr)
813         *new_block_ptr = new_va_block;
814 
815     return status;
816 
817 err:
818     uvm_mutex_unlock(&va_block->lock);
819     mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
820     uvm_va_block_release(new_va_block);
821     return status;
822 }
823 
824 // Check to see if the HMM va_block would overlap the range start/end and
825 // split it so it can be removed. That breaks down to the following cases:
826 // start/end could cover all of the HMM va_block ->
827 //     remove the va_block
828 // start/end could cover the left part of the HMM va_block ->
829 //     remove the left part
830 // start/end could cover the right part of the HMM va_block ->
831 //     remove the right part
832 // or start/end could "punch a hole" in the middle and leave the ends intact.
833 // In each case, only one HMM va_block is removed so return it in out_va_block.
834 static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
835                                        NvU64 start,
836                                        NvU64 end,
837                                        uvm_va_block_t **out_va_block)
838 {
839     uvm_va_block_context_t *va_block_context;
840     uvm_va_space_t *va_space;
841     struct mm_struct *mm;
842     struct vm_area_struct *vma;
843     uvm_va_block_region_t region;
844     NvU64 addr, from, to;
845     uvm_va_block_t *new;
846     NV_STATUS status;
847 
848     if (va_block->start < start) {
849         status = hmm_split_block(va_block, start - 1, &new);
850         if (status != NV_OK)
851             return status;
852 
853         // Keep the left part, the right part will be deleted.
854         va_block = new;
855     }
856 
857     if (va_block->end > end) {
858         status = hmm_split_block(va_block, end, NULL);
859         if (status != NV_OK)
860             return status;
861 
862         // Keep the right part, the left part will be deleted.
863     }
864 
865     *out_va_block = va_block;
866 
867     // Migrate any GPU data to sysmem before destroying the HMM va_block.
868     // We do this because the new va_range might be for a UVM external
869     // allocation which could be converting an address range that was first
    // operated on by UVM-HMM and the external allocation should see that data.
871     va_space = va_block->hmm.va_space;
872     mm = va_space->va_space_mm.mm;
873     va_block_context = uvm_va_space_block_context(va_space, mm);
874 
875     for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
876         vma = find_vma_intersection(mm, addr, va_block->end);
877         if (!vma)
878             break;
879 
880         from = max(addr, (NvU64)vma->vm_start);
881         to = min(va_block->end, (NvU64)vma->vm_end - 1);
882         region = uvm_va_block_region_from_start_end(va_block, from, to);
883 
884         if (!uvm_hmm_vma_is_valid(vma, from, false))
885             continue;
886 
887         va_block_context->hmm.vma = vma;
888 
889         status = hmm_migrate_range(va_block,
890                                    NULL,
891                                    va_block_context,
892                                    UVM_ID_CPU,
893                                    from,
894                                    to,
895                                    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
896                                    NULL);
897         if (status != NV_OK)
898             return status;
899     }
900 
901     return NV_OK;
902 }
903 
904 // Normally, the HMM va_block is destroyed when the va_space is destroyed
905 // (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
906 // a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
907 // page tables. However, it doesn't destroy the va_block because that would
908 // require calling mmu_interval_notifier_remove() which can't be called from
909 // the invalidate callback due to Linux locking constraints. If a process
910 // calls mmap()/munmap() for SAM and then creates a managed allocation,
911 // the same VMA range can be picked and there would be a UVM/HMM va_block
912 // conflict. Creating a managed allocation, external allocation, or other
913 // va_range types, calls this function to remove stale HMM va_blocks or split
914 // the HMM va_block so there is no overlap.
915 NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
916                                    struct mm_struct *mm,
917                                    NvU64 start,
918                                    NvU64 end)
919 {
920     uvm_range_tree_node_t *node, *next;
921     uvm_va_block_t *va_block;
922     NV_STATUS status;
923 
924     if (!uvm_hmm_is_enabled(va_space))
925         return NV_OK;
926 
927     if (mm)
928         uvm_assert_mmap_lock_locked(mm);
929     uvm_assert_rwsem_locked_write(&va_space->lock);
930 
931     // Process each HMM va_block that overlaps the interval [start, end].
932     // Note that end is inclusive.
933     // The blocks_lock is not needed when the va_space lock is held for write.
934     uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
935         va_block = hmm_va_block_from_node(node);
936 
937         if (mm) {
938             status = split_block_if_needed(va_block, start, end, &va_block);
939             if (status != NV_OK)
940                 return status;
941         }
942 
        // Note that this waits for any invalidation callbacks to complete
        // so uvm_hmm_invalidate() won't see a block disappear.
945         // The va_space write lock should prevent uvm_hmm_va_block_find_create()
946         // from adding it back.
947         mmu_interval_notifier_remove(&va_block->hmm.notifier);
948         uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
949         uvm_va_block_kill(va_block);
950     }
951 
952     UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));
953 
954     return NV_OK;
955 }
956 
957 void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
958 {
959     uvm_va_space_t *va_space = existing_va_block->hmm.va_space;
960 
961     UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
962     uvm_assert_rwsem_locked_write(&va_space->lock);
963 
964     uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
965                          &existing_va_block->hmm.node,
966                          &new_block->hmm.node);
967 }
968 
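// Ensure addr is a policy boundary within the HMM va_block that spans it:
// if split_needed_cb() reports that a split is required, the policy node
// containing addr is split at addr - 1.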
969 NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
970                                   NvU64 addr,
971                                   uvm_va_policy_is_split_needed_t split_needed_cb,
972                                   void *data)
973 {
974     uvm_va_block_t *va_block;
975     uvm_va_policy_node_t *node;
976     NV_STATUS status;
977 
978     uvm_assert_rwsem_locked_write(&va_space->lock);
979 
980     // If there is no HMM va_block or the va_block doesn't span the policy
981     // addr, there is no need to split.
982     status = uvm_hmm_va_block_find(va_space, addr, &va_block);
983     if (status != NV_OK || va_block->start == addr)
984         return NV_OK;
985 
986     uvm_mutex_lock(&va_block->lock);
987 
988     node = uvm_va_policy_node_find(va_block, addr);
989     if (!node)
990         goto done;
991 
992     // If the policy range doesn't span addr, we're done.
993     if (addr == node->node.start)
994         goto done;
995 
996     if (split_needed_cb(&node->policy, data))
997         status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL);
998 
999 done:
1000     uvm_mutex_unlock(&va_block->lock);
1001     return status;
1002 }
1003 
1004 static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block,
1005                                                    uvm_va_block_context_t *va_block_context,
1006                                                    uvm_processor_id_t preferred_location,
1007                                                    NvU64 addr,
1008                                                    NvU64 end,
1009                                                    uvm_tracker_t *out_tracker)
1010 {
1011     uvm_processor_mask_t set_accessed_by_processors;
1012     const uvm_va_policy_t *old_policy;
1013     uvm_va_policy_node_t *node;
1014     uvm_va_block_region_t region;
1015     uvm_processor_id_t id;
1016     NV_STATUS status, tracker_status;
1017 
1018     // Note that we can't just call uvm_va_policy_set_range() for the whole
    // range [addr, end] because we need to examine the old value of
1020     // policy->preferred_location before setting it. Thus we iterate over
1021     // the existing policy nodes.
1022     uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) {
1023         if (uvm_id_equal(old_policy->preferred_location, preferred_location))
1024             continue;
1025 
1026         // If the old preferred location is a valid processor ID, remote
1027         // mappings should be established to the new preferred location if
1028         // accessed-by is set.
1029         uvm_processor_mask_zero(&set_accessed_by_processors);
1030 
1031         if (UVM_ID_IS_VALID(old_policy->preferred_location) &&
1032             uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location))
1033             uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location);
1034 
1035         if (!uvm_va_policy_set_preferred_location(va_block, region, preferred_location, old_policy))
1036             return NV_ERR_NO_MEMORY;
1037 
1038         // Establish new remote mappings if the old preferred location had
1039         // accessed-by set.
1040         for_each_id_in_mask(id, &set_accessed_by_processors) {
1041             status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker);
1042             if (status != NV_OK)
1043                 return status;
1044         }
1045 
1046         // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock
1047         // the va_block lock, the policy remains valid because we hold the mmap
1048         // lock so munmap can't remove the policy, and the va_space lock so the
1049         // policy APIs can't change the policy.
1050         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1051                                            NULL,
1052                                            uvm_va_block_set_preferred_location_locked(va_block,
1053                                                                                       va_block_context,
1054                                                                                       region));
1055 
1056         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
1057         if (status == NV_OK)
1058             status = tracker_status;
1059 
1060         if (status != NV_OK)
1061             return status;
1062     }
1063 
1064     return NV_OK;
1065 }
1066 
1067 NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
1068                                          uvm_processor_id_t preferred_location,
1069                                          NvU64 base,
1070                                          NvU64 last_address,
1071                                          uvm_tracker_t *out_tracker)
1072 {
1073     uvm_va_block_context_t *va_block_context;
1074     uvm_va_block_t *va_block;
1075     NvU64 addr;
1076     NV_STATUS status = NV_OK;
1077 
1078     if (!uvm_hmm_is_enabled(va_space))
1079         return NV_ERR_INVALID_ADDRESS;
1080 
1081     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1082     uvm_assert_rwsem_locked_write(&va_space->lock);
1083     UVM_ASSERT(PAGE_ALIGNED(base));
1084     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1085     UVM_ASSERT(base < last_address);
1086 
1087     // Update HMM preferred location policy.
1088 
1089     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1090 
1091     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1092         NvU64 end;
1093 
1094         status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
1095         if (status != NV_OK)
1096             break;
1097 
1098         end = min(last_address, va_block->end);
1099 
1100         uvm_mutex_lock(&va_block->lock);
1101 
1102         status = hmm_set_preferred_location_locked(va_block,
1103                                                    va_block_context,
1104                                                    preferred_location,
1105                                                    addr,
1106                                                    end,
1107                                                    out_tracker);
1108 
1109         uvm_mutex_unlock(&va_block->lock);
1110 
1111         if (status != NV_OK)
1112             break;
1113     }
1114 
1115     return status;
1116 }
1117 
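// Establish accessed-by mappings for processor_id over [start, end], one
// policy node at a time, skipping ranges where read duplication is enabled.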
1118 static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block,
1119                                                       uvm_va_block_context_t *va_block_context,
1120                                                       uvm_processor_id_t processor_id,
1121                                                       NvU64 start,
1122                                                       NvU64 end,
1123                                                       uvm_tracker_t *out_tracker)
1124 {
1125     uvm_va_space_t *va_space = va_block->hmm.va_space;
1126     uvm_va_policy_node_t *node;
1127     uvm_va_block_region_t region;
1128     NV_STATUS status = NV_OK;
1129 
1130     uvm_for_each_va_policy_node_in(node, va_block, start, end) {
1131         // Read duplication takes precedence over SetAccessedBy.
1132         // Do not add mappings if read duplication is enabled.
1133         if (uvm_va_policy_is_read_duplicate(&node->policy, va_space))
1134             continue;
1135 
1136         region = uvm_va_block_region_from_start_end(va_block,
1137                                                     max(start, node->node.start),
1138                                                     min(end, node->node.end));
1139 
1140         status = uvm_va_block_set_accessed_by_locked(va_block,
1141                                                      va_block_context,
1142                                                      processor_id,
1143                                                      region,
1144                                                      out_tracker);
1145         if (status != NV_OK)
1146             break;
1147     }
1148 
1149     return status;
1150 }
1151 
1152 NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space,
1153                                   uvm_processor_id_t processor_id,
1154                                   bool set_bit,
1155                                   NvU64 base,
1156                                   NvU64 last_address,
1157                                   uvm_tracker_t *out_tracker)
1158 {
1159     uvm_va_block_context_t *va_block_context;
1160     uvm_va_block_t *va_block;
1161     NvU64 addr;
1162     NV_STATUS status = NV_OK;
1163 
1164     if (!uvm_hmm_is_enabled(va_space))
1165         return NV_ERR_INVALID_ADDRESS;
1166 
1167     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1168     uvm_assert_rwsem_locked_write(&va_space->lock);
1169     UVM_ASSERT(PAGE_ALIGNED(base));
1170     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1171     UVM_ASSERT(base < last_address);
1172 
1173     // Update HMM accessed by policy.
1174 
1175     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1176 
1177     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1178         NvU64 end;
1179 
1180         status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
1181         if (status != NV_OK)
1182             break;
1183 
1184         end = min(last_address, va_block->end);
1185 
1186         uvm_mutex_lock(&va_block->lock);
1187 
1188         status = uvm_va_policy_set_range(va_block,
1189                                          addr,
1190                                          end,
1191                                          UVM_VA_POLICY_ACCESSED_BY,
1192                                          !set_bit,
1193                                          processor_id,
1194                                          UVM_READ_DUPLICATION_MAX);
1195 
1196         if (status == NV_OK && set_bit) {
1197             status = hmm_set_accessed_by_start_end_locked(va_block,
1198                                                           va_block_context,
1199                                                           processor_id,
1200                                                           addr,
1201                                                           end,
1202                                                           out_tracker);
1203         }
1204 
1205         uvm_mutex_unlock(&va_block->lock);
1206 
1207         if (status != NV_OK)
1208             break;
1209     }
1210 
1211     return status;
1212 }
1213 
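// Add mappings to memory that was evicted from GPU(s): re-establish
// accessed-by mappings for each policy range and, when remote mapping on
// eviction is enabled, also map access-counter-capable GPUs to their evicted
// pages.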
1214 void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
1215                                          uvm_va_block_t *va_block,
1216                                          uvm_va_block_context_t *block_context)
1217 {
1218     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1219     uvm_va_policy_node_t *node;
1220     uvm_va_block_region_t region;
1221     uvm_processor_mask_t map_processors;
1222     uvm_processor_id_t id;
1223     NV_STATUS tracker_status;
1224     NV_STATUS status = NV_OK;
1225 
1226     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1227     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1228     uvm_assert_rwsem_locked(&va_space->lock);
1229 
1230     uvm_mutex_lock(&va_block->lock);
1231 
1232     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
1233         for_each_id_in_mask(id, &node->policy.accessed_by) {
1234             status = hmm_set_accessed_by_start_end_locked(va_block,
1235                                                           block_context,
1236                                                           id,
1237                                                           node->node.start,
1238                                                           node->node.end,
1239                                                           &local_tracker);
1240             if (status != NV_OK)
1241                 break;
1242 
1243             if (!uvm_va_space_map_remote_on_eviction(va_space))
1244                 continue;
1245 
            // Exclude the processors that have already been mapped due to
1247             // AccessedBy.
1248             uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
1249 
1250             for_each_gpu_id_in_mask(id, &map_processors) {
1251                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
1252                 uvm_va_block_gpu_state_t *gpu_state;
1253 
1254                 if (!gpu->parent->access_counters_supported)
1255                     continue;
1256 
1257                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1258                 UVM_ASSERT(gpu_state);
1259 
1260                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
1261                 // remote mappings to read-duplicated pages. Add support for it
1262                 // or create a new function.
1263                 status = uvm_va_block_add_mappings(va_block,
1264                                                    block_context,
1265                                                    id,
1266                                                    region,
1267                                                    &gpu_state->evicted,
1268                                                    UvmEventMapRemoteCauseEviction);
1269                 tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker);
1270                 status = (status == NV_OK) ? tracker_status : status;
1271                 if (status != NV_OK) {
1272                     UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
1273                     break;
1274                 }
1275             }
1276         }
1277     }
1278 
1279     uvm_mutex_unlock(&va_block->lock);
1280 
1281     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1282     status = (status == NV_OK) ? tracker_status : status;
1283     if (status != NV_OK) {
1284         UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n",
1285                       va_block->start,
1286                       va_block->end,
1287                       nvstatusToString(status));
1288     }
1289 }
1290 
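// Return the policy covering addr and clamp *endp so it doesn't extend past
// the end of the VMA or of the policy node containing addr.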
1291 const uvm_va_policy_t *uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
1292                                                struct vm_area_struct *vma,
1293                                                unsigned long addr,
1294                                                NvU64 *endp)
1295 {
1296     const uvm_va_policy_node_t *node;
1297     const uvm_va_policy_t *policy;
1298     NvU64 end = va_block->end;
1299 
1300     uvm_assert_mmap_lock_locked(vma->vm_mm);
1301     uvm_assert_mutex_locked(&va_block->lock);
1302 
1303     if (end > vma->vm_end - 1)
1304         end = vma->vm_end - 1;
1305 
1306     node = uvm_va_policy_node_find(va_block, addr);
1307     if (node) {
1308         policy = &node->policy;
1309         if (end > node->node.end)
1310             end = node->node.end;
1311     }
1312     else {
1313         policy = &uvm_va_policy_default;
1314     }
1315 
1316     *endp = end;
1317 
1318     return policy;
1319 }
1320 
1321 NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
1322                                             struct vm_area_struct **vma_out,
1323                                             uvm_page_index_t page_index,
1324                                             const uvm_va_policy_t **policy,
1325                                             uvm_page_index_t *outerp)
1326 {
1327     unsigned long addr;
1328     NvU64 end;
1329     uvm_page_index_t outer;
1330     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1331     struct mm_struct *mm = va_space->va_space_mm.mm;
1332 
1333     if (!mm)
1334         return NV_ERR_INVALID_ADDRESS;
1335 
1336     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1337     uvm_assert_mmap_lock_locked(mm);
1338     uvm_assert_mutex_locked(&va_block->lock);
1339 
1340     addr = uvm_va_block_cpu_page_address(va_block, page_index);
1341 
1342     *vma_out = vma_lookup(mm, addr);
1343     if (!*vma_out || !((*vma_out)->vm_flags & VM_READ))
1344         return NV_ERR_INVALID_ADDRESS;
1345 
1346     *policy = uvm_hmm_find_policy_end(va_block, *vma_out, addr, &end);
1347 
1348     outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
1349     if (*outerp > outer)
1350         *outerp = outer;
1351 
1352     return NV_OK;
1353 }
1354 
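// Clear thrashing mitigation state for the whole va_block: unmap remote
// pinned pages for each policy range and destroy the block's thrashing info.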
1355 static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
1356                                             uvm_va_block_context_t *block_context)
1357 {
1358     const uvm_va_policy_t *policy;
1359     uvm_va_policy_node_t *node;
1360     uvm_va_block_region_t region;
1361     NV_STATUS status = NV_OK;
1362 
1363     uvm_mutex_lock(&va_block->lock);
1364 
1365     uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) {
1366         // Unmap may split PTEs and require a retry. Needs to be called
1367         // before the pinned pages information is destroyed.
1368         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1369                                            NULL,
1370                                            uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
1371                                                                                             block_context,
1372                                                                                             region));
1373 
1374         uvm_perf_thrashing_info_destroy(va_block);
1375 
1376         if (status != NV_OK)
1377             break;
1378     }
1379 
1380     uvm_mutex_unlock(&va_block->lock);
1381 
1382     return status;
1383 }
1384 
1385 NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
1386 {
1387     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
1388     uvm_range_tree_node_t *node, *next;
1389     uvm_va_block_t *va_block;
1390     NV_STATUS status = NV_OK;
1391 
1392     if (!uvm_hmm_is_enabled(va_space))
1393         return NV_OK;
1394 
1395     uvm_assert_rwsem_locked_write(&va_space->lock);
1396 
1397     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
1398         va_block = hmm_va_block_from_node(node);
1399 
1400         status = hmm_clear_thrashing_policy(va_block, block_context);
1401         if (status != NV_OK)
1402             break;
1403     }
1404 
1405     return status;
1406 }
1407 
1408 uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
1409                                                   struct vm_area_struct *vma,
1410                                                   const uvm_va_policy_t *policy,
1411                                                   NvU64 address)
1412 {
1413     NvU64 start, end;
1414 
1415     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1416 
1417     // We need to limit the prefetch region to the VMA.
1418     start = max(va_block->start, (NvU64)vma->vm_start);
1419     end = min(va_block->end, (NvU64)vma->vm_end - 1);
1420 
1421     // Also, we need to limit the prefetch region to the policy range.
1422     if (uvm_va_policy_is_default(policy)) {
1423         NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
1424                                                        address,
1425                                                        &start,
1426                                                        &end);
1427         // We already know the hole exists and covers the fault region.
1428         UVM_ASSERT(status == NV_OK);
1429     }
1430     else {
1431         const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);
1432 
1433         start = max(start, node->node.start);
1434         end = min(end, node->node.end);
1435     }
1436 
1437     return uvm_va_block_region_from_start_end(va_block, start, end);
1438 }
1439 
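// Derive the logical protection for 'addr' from the VMA flags: UVM_PROT_NONE
// without VM_READ, UVM_PROT_READ_ONLY without VM_WRITE, and
// UVM_PROT_READ_WRITE_ATOMIC otherwise.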
1440 uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
1441                                         struct vm_area_struct *vma,
1442                                         NvU64 addr)
1443 {
1444     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1445     uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
1446     UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);
1447 
1448     if (!(vma->vm_flags & VM_READ))
1449         return UVM_PROT_NONE;
1450     else if (!(vma->vm_flags & VM_WRITE))
1451         return UVM_PROT_READ_ONLY;
1452     else
1453         return UVM_PROT_READ_WRITE_ATOMIC;
1454 }
1455 
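// Wrap 'page' in an HMM CPU chunk, insert the chunk into the va_block at
// 'page_index', and map it for GPU access. Returns NV_ERR_INVALID_ADDRESS for
// the zero page.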
1456 static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
1457                                                 uvm_page_index_t page_index,
1458                                                 struct page *page)
1459 {
1460     uvm_cpu_chunk_t *chunk;
1461     NV_STATUS status;
1462 
1463     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1464     UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index));
1465 
1466     if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)))
1467         return NV_ERR_INVALID_ADDRESS;
1468 
1469     status = uvm_cpu_chunk_alloc_hmm(page, &chunk);
1470     if (status != NV_OK)
1471         return status;
1472 
1473     status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
1474     if (status != NV_OK) {
1475         uvm_cpu_chunk_free(chunk);
1476         return status;
1477     }
1478 
1479     status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
1480     if (status != NV_OK) {
1481         uvm_cpu_chunk_remove_from_block(va_block, page_index);
1482         uvm_cpu_chunk_free(chunk);
1483     }
1484 
1485     return status;
1486 }
1487 
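// Undo hmm_va_block_cpu_page_populate(): remove the CPU chunk at 'page_index'
// from the va_block, unmap it from the GPUs, and free it. Does nothing if no
// chunk is allocated for the page.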
1488 static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
1489                                              uvm_page_index_t page_index)
1490 {
1491     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
1492 
1493     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1494 
1495     if (!chunk)
1496         return;
1497 
1498     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1499                !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1500 
1501     uvm_cpu_chunk_remove_from_block(va_block, page_index);
1502     uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1503     uvm_cpu_chunk_free(chunk);
1504 }
1505 
1506 static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
1507                                           uvm_page_index_t page_index,
1508                                           struct page *page)
1509 {
1510     struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
1511 
1512     UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
1513     return old_page == page;
1514 }
1515 
1516 // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the
1517 // service_context masks to match what is being processed. Since a page
1518 // that was expected to be processed isn't migrating, we have to clear the
1519 // masks to make service_context consistent with what is actually being
1520 // handled.
1521 static void clear_service_context_masks(uvm_service_block_context_t *service_context,
1522                                         uvm_processor_id_t new_residency,
1523                                         uvm_page_index_t page_index)
1524 {
1525     uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);
1526 
1527     uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
1528                         page_index);
1529 
1530     if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency))
1531         uvm_processor_mask_clear(&service_context->resident_processors, new_residency);
1532 
1533     if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency))
1534         uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index);
1535 
1536     if (service_context->thrashing_pin_count > 0 &&
1537         uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) {
1538         service_context->thrashing_pin_count--;
1539     }
1540 
1541     if (service_context->read_duplicate_count > 0 &&
1542         uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) {
1543         service_context->read_duplicate_count--;
1544     }
1545 }
1546 
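// Record in the va_block state that the CPU maps the page at 'page_index' for
// read, and for write as well when 'is_write' is true.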
1547 static void cpu_mapping_set(uvm_va_block_t *va_block,
1548                             bool is_write,
1549                             uvm_page_index_t page_index)
1550 {
1551     uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
1552     uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
1553     uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1554     if (is_write)
1555         uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1556     else
1557         uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1558 }
1559 
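// Record in the va_block state that the CPU no longer maps the page at
// 'page_index'.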
1560 static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1561 {
1562     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1563     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1564     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1565         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1566 }
1567 
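// Clear the va_block's pointer to the GPU chunk backing the given device
// private page and unmap the chunk from the GPU's address space.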
1568 static void gpu_chunk_remove(uvm_va_block_t *va_block,
1569                              uvm_page_index_t page_index,
1570                              struct page *page)
1571 {
1572     uvm_va_block_gpu_state_t *gpu_state;
1573     uvm_gpu_chunk_t *gpu_chunk;
1574     uvm_gpu_id_t id;
1575 
1576     id = uvm_pmm_devmem_page_to_gpu_id(page);
1577     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1578     UVM_ASSERT(gpu_state);
1579 
1580     gpu_chunk = gpu_state->chunks[page_index];
1581     if (!gpu_chunk) {
1582         // If we didn't find a chunk it's because the page was unmapped for
1583         // mremap and no fault has established a new mapping.
1584         UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
1585         return;
1586     }
1587 
1588     // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
1589 
1590     uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
1591     gpu_state->chunks[page_index] = NULL;
1592 }
1593 
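// Record the GPU chunk backing the given device private page in the
// va_block's GPU state and mark the page resident on that GPU, allocating the
// GPU state and the chunk's virtual mapping if needed.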
1594 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
1595                                uvm_page_index_t page_index,
1596                                struct page *page)
1597 {
1598     uvm_va_block_gpu_state_t *gpu_state;
1599     uvm_gpu_chunk_t *gpu_chunk;
1600     uvm_gpu_id_t id;
1601     NV_STATUS status;
1602 
1603     id = uvm_pmm_devmem_page_to_gpu_id(page);
1604     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1605 
1606     // It's possible that this is a fresh va_block we're trying to add an
1607     // existing gpu_chunk to. This occurs for example when a GPU faults on a
1608     // virtual address that has been remapped with mremap().
1609     if (!gpu_state) {
1610         status = uvm_va_block_gpu_state_alloc(va_block);
1611         if (status != NV_OK)
1612             return status;
1613         gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1614     }
1615 
1616     UVM_ASSERT(gpu_state);
1617 
    // Note that a mremap() might move the mapping to a CPU virtual address
    // that is no longer aligned with a larger GPU chunk size. We would need
    // to allocate a new aligned GPU chunk and copy from old to new.
1621     // TODO: Bug 3368756: add support for large GPU pages.
1622     gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
1623     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1624     UVM_ASSERT(gpu_chunk->is_referenced);
1625     UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
1626 
1627     if (gpu_state->chunks[page_index] == gpu_chunk)
1628         return NV_OK;
1629 
1630     UVM_ASSERT(!gpu_state->chunks[page_index]);
1631 
1632     // In some configurations such as SR-IOV heavy, the chunk cannot be
1633     // referenced using its physical address. Create a virtual mapping.
1634     status = uvm_mmu_chunk_map(gpu_chunk);
1635     if (status != NV_OK)
1636         return status;
1637 
1638     // TODO: Bug 3898467: map indirect peers.
1639 
1640     uvm_processor_mask_set(&va_block->resident, id);
1641     uvm_page_mask_set(&gpu_state->resident, page_index);
1642 
1643     // It is safe to modify the page index field without holding any PMM locks
1644     // because the chunk is allocated, which means that none of the other
1645     // fields in the bitmap can change.
1646     gpu_chunk->va_block = va_block;
1647     gpu_chunk->va_block_page_index = page_index;
1648 
1649     gpu_state->chunks[page_index] = gpu_chunk;
1650 
1651     return NV_OK;
1652 }
1653 
1654 // This is called just before calling migrate_vma_finalize() in order to wait
1655 // for GPU operations to complete and update the va_block state to match which
1656 // pages migrated (or not) and therefore which pages will be released by
1657 // migrate_vma_finalize().
1658 // 'migrated_pages' is the mask of pages that migrated,
1659 // 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
1660 // and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
1661 // 'region' is the page index region of all migrated, non-migrated, and
1662 // same_devmem_page_mask pages.
1663 static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
1664                                            const unsigned long *src_pfns,
1665                                            const unsigned long *dst_pfns,
1666                                            uvm_va_block_region_t region,
1667                                            const uvm_page_mask_t *migrated_pages,
1668                                            const uvm_page_mask_t *same_devmem_page_mask)
1669 {
1670     uvm_page_index_t page_index;
1671     NV_STATUS status;
1672 
1673     // Wait for the GPU to finish. migrate_vma_finalize() will release the
    // migrated source pages (or non-migrating destination pages), so GPU
    // operations must be finished by then.
1676     status = uvm_tracker_wait(&va_block->tracker);
1677 
1678     for_each_va_block_page_in_region(page_index, region) {
1679         struct page *page;
1680 
1681         if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1682             continue;
1683 
1684         // If a page migrated, clean up the source page.
1685         // Otherwise, clean up the destination page.
1686         if (uvm_page_mask_test(migrated_pages, page_index))
1687             page = migrate_pfn_to_page(src_pfns[page_index]);
1688         else
1689             page = migrate_pfn_to_page(dst_pfns[page_index]);
1690 
1691         if (!page)
1692             continue;
1693 
1694         if (is_device_private_page(page)) {
1695             gpu_chunk_remove(va_block, page_index, page);
1696         }
1697         else {
1698             // If the source page is a system memory page,
1699             // migrate_vma_finalize() will release the reference so we should
1700             // clear our pointer to it.
1701             // TODO: Bug 3660922: Need to handle read duplication at some point.
1702             hmm_va_block_cpu_page_unpopulate(va_block, page_index);
1703         }
1704     }
1705 
1706     return status;
1707 }
1708 
1709 // Update va_block state to reflect that the page isn't migrating.
1710 static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
1711                                         const unsigned long *src_pfns,
1712                                         unsigned long *dst_pfns,
1713                                         uvm_page_index_t page_index)
1714 {
1715     struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]);
1716 
1717     if (!dst_page)
1718         return;
1719 
1720     // migrate_vma_finalize() will release the dst_page reference so don't keep
1721     // a pointer to it.
1722     if (is_device_private_page(dst_page)) {
1723         gpu_chunk_remove(va_block, page_index, dst_page);
1724     }
1725     else {
1726         UVM_ASSERT(page_ref_count(dst_page) == 1);
1727 
1728         hmm_va_block_cpu_page_unpopulate(va_block, page_index);
1729     }
1730 
1731     unlock_page(dst_page);
1732     put_page(dst_page);
1733     dst_pfns[page_index] = 0;
1734 }
1735 
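// Wait for outstanding GPU operations on the va_block, then clean up the
// destination pages for every page in 'page_mask' that is not migrating.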
1736 static void clean_up_non_migrating_pages(uvm_va_block_t *va_block,
1737                                          const unsigned long *src_pfns,
1738                                          unsigned long *dst_pfns,
1739                                          uvm_va_block_region_t region,
1740                                          uvm_page_mask_t *page_mask)
1741 {
1742     uvm_page_index_t page_index;
1743     NV_STATUS status;
1744 
1745     status = uvm_tracker_wait(&va_block->tracker);
1746     UVM_ASSERT(status == NV_OK);
1747 
1748     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1749         clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index);
1750     }
1751 }
1752 
1753 // CPU page fault handling.
1754 
1755 // Fill in the dst_pfns[page_index] entry given that there is an allocated
1756 // CPU page.
1757 static void lock_block_cpu_page(uvm_va_block_t *va_block,
1758                                 uvm_page_index_t page_index,
1759                                 struct page *src_page,
1760                                 unsigned long *dst_pfns,
1761                                 uvm_page_mask_t *same_devmem_page_mask)
1762 {
1763     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
1764     uvm_va_block_region_t chunk_region;
1765     struct page *dst_page;
1766 
1767     UVM_ASSERT(chunk);
1768     UVM_ASSERT(chunk->page);
1769 
1770     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
1771 
1772     dst_page = chunk->page + (page_index - chunk_region.first);
1773 
1774     UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)));
1775     UVM_ASSERT(!is_device_private_page(dst_page));
1776 
1777     // The source page is usually a device private page but it could be a GPU
1778     // remote mapped system memory page. It could also be a driver allocated
1779     // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned
1780     // by the driver).
1781     if (is_device_private_page(src_page)) {
1782         // Since the page isn't mirrored, it was allocated by alloc_pages()
1783         // and UVM owns the reference. We leave the reference count unchanged
1784         // and mark the page pointer as mirrored since UVM is transferring
1785         // ownership to Linux and we don't want UVM to double free the page in
        // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
        // does not migrate, however, it will still be freed.
1788         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1789                    !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1790         UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
1791         UVM_ASSERT(page_ref_count(dst_page) == 1);
1792         uvm_cpu_chunk_make_hmm(chunk);
1793     }
1794     else {
1795         UVM_ASSERT(same_devmem_page_mask);
1796         UVM_ASSERT(src_page == dst_page);
1797         uvm_page_mask_set(same_devmem_page_mask, page_index);
1798 
1799         // The call to migrate_vma_setup() will have inserted a migration PTE
1800         // so the CPU has no access.
1801         cpu_mapping_clear(va_block, page_index);
1802         return;
1803     }
1804 
1805     lock_page(dst_page);
1806     dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
1807 }
1808 
1809 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block,
1810                                           uvm_gpu_t *gpu,
1811                                           uvm_gpu_chunk_t *gpu_chunk)
1812 {
1813     // Tell PMM to expect a callback from Linux to free the page since the
1814     // device private struct page reference count will determine when the
1815     // GPU chunk is free.
1816     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1817     list_del_init(&gpu_chunk->list);
1818     uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block);
1819 }
1820 
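// Fill dst_pfns[page_index] with the device private PFN of the destination
// GPU chunk, initializing its struct page and handing the chunk's lifetime
// over to the page reference count. If the source is already that same device
// private page, record it in same_devmem_page_mask instead.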
1821 static void fill_dst_pfn(uvm_va_block_t *va_block,
1822                          uvm_gpu_t *gpu,
1823                          const unsigned long *src_pfns,
1824                          unsigned long *dst_pfns,
1825                          uvm_page_index_t page_index,
1826                          uvm_page_mask_t *same_devmem_page_mask)
1827 {
1828     unsigned long src_pfn = src_pfns[page_index];
1829     uvm_gpu_chunk_t *gpu_chunk;
1830     unsigned long pfn;
1831     struct page *dpage;
1832 
1833     gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index));
1834     UVM_ASSERT(gpu_chunk);
1835     UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT);
1836     pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
1837 
1838     // If the same GPU page is both source and destination, migrate_vma_pages()
1839     // will see the wrong "expected" reference count and not migrate it, so we
1840     // mark it as not migrating but we keep track of this so we don't confuse
1841     // it with a page that migrate_vma_pages() actually does not migrate.
1842     if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) {
1843         uvm_page_mask_set(same_devmem_page_mask, page_index);
1844         return;
1845     }
1846 
1847     dpage = pfn_to_page(pfn);
1848     UVM_ASSERT(is_device_private_page(dpage));
1849     UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global);
1850 
1851     hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
1852     UVM_ASSERT(!page_count(dpage));
1853     zone_device_page_init(dpage);
1854     dpage->zone_device_data = va_block->hmm.va_space;
1855 
1856     dst_pfns[page_index] = migrate_pfn(pfn);
1857 }
1858 
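// Fill the destination PFN entries for every page in 'page_mask' that
// migrate_vma_setup() selected for migration to the GPU 'dest_id'.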
1859 static void fill_dst_pfns(uvm_va_block_t *va_block,
1860                           const unsigned long *src_pfns,
1861                           unsigned long *dst_pfns,
1862                           uvm_va_block_region_t region,
1863                           uvm_page_mask_t *page_mask,
1864                           uvm_page_mask_t *same_devmem_page_mask,
1865                           uvm_processor_id_t dest_id)
1866 {
1867     uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
1868     uvm_page_index_t page_index;
1869 
1870     uvm_page_mask_zero(same_devmem_page_mask);
1871 
1872     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1873         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE))
1874             continue;
1875 
1876         fill_dst_pfn(va_block,
1877                      gpu,
1878                      src_pfns,
1879                      dst_pfns,
1880                      page_index,
1881                      same_devmem_page_mask);
1882     }
1883 }
1884 
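// Prepare system memory destination pages for every page in 'page_mask' that
// migrate_vma_setup() selected: reuse an already allocated CPU chunk when
// present, otherwise allocate and populate a new anonymous page. Pages that
// cannot or need not migrate are cleared from the masks. Returns
// NV_WARN_MORE_PROCESSING_REQUIRED if nothing is left to migrate.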
1885 static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
1886                                        struct vm_area_struct *vma,
1887                                        const unsigned long *src_pfns,
1888                                        unsigned long *dst_pfns,
1889                                        uvm_va_block_region_t region,
1890                                        uvm_page_mask_t *page_mask,
1891                                        uvm_page_mask_t *same_devmem_page_mask,
1892                                        uvm_processor_id_t processor_id,
1893                                        uvm_service_block_context_t *service_context)
1894 {
1895     uvm_page_index_t page_index;
1896     NV_STATUS status = NV_OK;
1897 
1898     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1899         struct page *src_page;
1900         struct page *dst_page;
1901         gfp_t gfp;
1902 
1903         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
1904             // Device exclusive PTEs are not selected but we still want to
1905             // process the page so record it as such.
1906             if (service_context && !UVM_ID_IS_CPU(processor_id) &&
1907                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
1908                 uvm_page_mask_set(same_devmem_page_mask, page_index);
1909                 continue;
1910             }
1911 
1912             // We have previously found a page that is CPU resident which can't
1913             // be migrated (probably a shared mapping) so make sure we establish
1914             // a remote mapping for it.
1915             if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1916                 continue;
1917 
1918             goto clr_mask;
1919         }
1920 
1921         // This is the page that will be copied to system memory.
1922         src_page = migrate_pfn_to_page(src_pfns[page_index]);
1923 
1924         if (src_page) {
            // mremap may have caused us to lose the gpu_chunk associated with
            // this va_block/page_index so make sure we have the correct chunk.
1927             if (is_device_private_page(src_page))
1928                 gpu_chunk_add(va_block, page_index, src_page);
1929 
1930             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
1931                 lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
1932                 continue;
1933             }
1934         }
1935 
1936         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1937                    !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1938 
1939         // Allocate a user system memory page for the destination.
1940         // This is the typical case since Linux will free the source page when
1941         // migrating to device private memory.
1942         // If there is no source page, it means the page is pte_none() or the
1943         // zero page. This case "shouldn't happen" because we asked
1944         // migrate_vma_setup() only for device private pages but
1945         // migrate_vma_collect_hole() doesn't check the
1946         // MIGRATE_VMA_SELECT_SYSTEM flag.
1947         gfp = GFP_HIGHUSER_MOVABLE;
1948         if (!src_page)
1949             gfp |= __GFP_ZERO;
1950 
1951         dst_page = alloc_page_vma(gfp,
1952                                   vma,
1953                                   va_block->start + (page_index << PAGE_SHIFT));
1954         if (!dst_page) {
1955             // Ignore errors if the page is only for prefetching.
1956             if (service_context &&
1957                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1958                 goto clr_mask;
1959             UVM_ERR_PRINT("cannot allocate page %u (addr 0x%llx)\n",
1960                           page_index, va_block->start + (page_index << PAGE_SHIFT));
1961             status = NV_ERR_NO_MEMORY;
1962             break;
1963         }
1964 
1965         status = hmm_va_block_cpu_page_populate(va_block, page_index, dst_page);
1966         if (status != NV_OK) {
1967             __free_page(dst_page);
1968             // Ignore errors if the page is only for prefetching.
1969             if (service_context &&
1970                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1971                 goto clr_mask;
1972             break;
1973         }
1974 
1975         // Note that we don't call get_page(dst_page) since alloc_page_vma()
1976         // returns with a page reference count of one and we are passing
1977         // ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded
1978         // the page as "mirrored" so that migrate_vma_finalize() and
1979         // hmm_va_block_cpu_page_unpopulate() don't double free the page.
1980         lock_page(dst_page);
1981         dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
1982         continue;
1983 
1984     clr_mask:
1985         // TODO: Bug 3900774: clean up murky mess of mask clearing.
1986         uvm_page_mask_clear(page_mask, page_index);
1987         if (service_context)
1988             clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
1989     }
1990 
1991     if (status != NV_OK)
1992         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
1993     else if (uvm_page_mask_empty(page_mask))
1994         return NV_WARN_MORE_PROCESSING_REQUIRED;
1995 
1996     return status;
1997 }
1998 
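// First half of migrating device private pages back to system memory for a
// fault: build the migration page mask, set up the CPU destination pages, and
// start the copy without yet updating residency or mappings.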
1999 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2000 {
2001     uvm_processor_id_t processor_id;
2002     uvm_service_block_context_t *service_context;
2003     uvm_va_block_retry_t *va_block_retry;
2004     const unsigned long *src_pfns;
2005     unsigned long *dst_pfns;
2006     uvm_page_mask_t *page_mask;
2007     uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask;
2008     uvm_va_block_t *va_block;
2009     NV_STATUS status = NV_OK;
2010 
2011     processor_id = devmem_fault_context->processor_id;
2012     service_context = devmem_fault_context->service_context;
2013     va_block_retry = devmem_fault_context->va_block_retry;
2014     va_block = devmem_fault_context->va_block;
2015     src_pfns = service_context->block_context.hmm.src_pfns;
2016     dst_pfns = service_context->block_context.hmm.dst_pfns;
2017 
2018     // Build the migration page mask.
2019     // Note that thrashing pinned pages and prefetch pages are already
2020     // accounted for in service_context->per_processor_masks.
2021     page_mask = &devmem_fault_context->page_mask;
2022     uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
2023 
2024     status = alloc_and_copy_to_cpu(va_block,
2025                                    service_context->block_context.hmm.vma,
2026                                    src_pfns,
2027                                    dst_pfns,
2028                                    service_context->region,
2029                                    page_mask,
2030                                    same_devmem_page_mask,
2031                                    processor_id,
2032                                    service_context);
2033     if (status != NV_OK)
2034         return status;
2035 
2036     // Do the copy but don't update the residency or mapping for the new
2037     // location yet.
2038     return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2039 }
2040 
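// Second half of migrating device private pages back to system memory for a
// fault: drop pages that HMM declined to migrate, finish servicing the
// remaining pages, and synchronize the va_block state with what actually
// migrated.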
2041 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2042 {
2043     uvm_processor_id_t processor_id;
2044     uvm_service_block_context_t *service_context;
2045     uvm_perf_prefetch_hint_t *prefetch_hint;
2046     uvm_va_block_retry_t *va_block_retry;
2047     const unsigned long *src_pfns;
2048     unsigned long *dst_pfns;
2049     uvm_page_mask_t *page_mask;
2050     uvm_va_block_t *va_block;
2051     uvm_va_block_region_t region;
2052     uvm_page_index_t page_index;
2053     NV_STATUS status, tracker_status;
2054 
2055     processor_id = devmem_fault_context->processor_id;
2056     service_context = devmem_fault_context->service_context;
2057     prefetch_hint = &service_context->prefetch_hint;
2058     va_block = devmem_fault_context->va_block;
2059     va_block_retry = devmem_fault_context->va_block_retry;
2060     src_pfns = service_context->block_context.hmm.src_pfns;
2061     dst_pfns = service_context->block_context.hmm.dst_pfns;
2062     region = service_context->region;
2063 
2064     page_mask = &devmem_fault_context->page_mask;
2065 
2066     // There are a number of reasons why HMM will mark a page as not migrating
2067     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2068     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2069         if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE)
2070             continue;
2071 
2072         // If a page isn't migrating and only the GPU page table is being
2073         // updated, continue to process it normally.
2074         if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index))
2075             continue;
2076 
2077         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2078         uvm_page_mask_clear(page_mask, page_index);
2079         clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
2080     }
2081 
2082     if (uvm_page_mask_empty(page_mask))
2083         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2084     else
2085         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2086 
2087     tracker_status = sync_page_and_chunk_state(va_block,
2088                                                src_pfns,
2089                                                dst_pfns,
2090                                                region,
2091                                                page_mask,
2092                                                &devmem_fault_context->same_devmem_page_mask);
2093 
2094     return status == NV_OK ? tracker_status : status;
2095 }
2096 
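// Update the va_block's allocation, residency, and CPU mapping state from the
// PFNs reported by hmm_range_fault() for 'region'.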
2097 static NV_STATUS populate_region(uvm_va_block_t *va_block,
2098                                  unsigned long *pfns,
2099                                  uvm_va_block_region_t region,
2100                                  uvm_page_mask_t *populated_page_mask)
2101 {
2102     uvm_page_index_t page_index;
2103     NV_STATUS status;
2104 
2105     // Make sure GPU state is allocated or else the GPU DMA mappings to
2106     // system memory won't be saved.
2107     status = uvm_va_block_gpu_state_alloc(va_block);
2108     if (status != NV_OK)
2109         return status;
2110 
2111     for_each_va_block_page_in_region(page_index, region) {
2112         struct page *page;
2113 
2114         // This case should only happen when querying CPU residency and we ask
2115         // for something not covered by a VMA. Otherwise, hmm_range_fault()
2116         // returns -EFAULT instead of setting the HMM_PFN_ERROR bit.
2117         if (pfns[page_index] & HMM_PFN_ERROR)
2118             return NV_ERR_INVALID_ADDRESS;
2119 
2120         if (pfns[page_index] & HMM_PFN_VALID) {
2121             page = hmm_pfn_to_page(pfns[page_index]);
2122         }
2123         else {
            // The page can't be evicted since it has to be migrated to the GPU
            // first, which would leave a device private page entry, so this has
            // to be a pte_none(), swapped out, or similar entry.
2127             // The page would have been allocated if populate_region() is being
2128             // called from uvm_hmm_va_block_service_locked() so this must be
2129             // for uvm_hmm_va_block_update_residency_info(). Just leave the
2130             // residency/populated information unchanged since
2131             // uvm_hmm_invalidate() should handle that if the underlying page
2132             // is invalidated.
2133             // Also note there can be an allocated page due to GPU-to-GPU
2134             // migration between non-peer or indirect peer GPUs.
2135             continue;
2136         }
2137 
2138         if (is_device_private_page(page)) {
2139             // Linux can call hmm_invalidate() and we have to clear the GPU
2140             // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might
2141             // not release the device private struct page reference. Since
2142             // hmm_range_fault() did find a device private PTE, we can
2143             // re-establish the GPU chunk pointer.
2144             status = gpu_chunk_add(va_block, page_index, page);
2145             if (status != NV_OK)
2146                 return status;
2147             continue;
2148         }
2149 
        // If a CPU chunk is already allocated, check to see that it matches
        // what hmm_range_fault() found.
2152         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2153             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2154         }
2155         else {
2156             status = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2157             if (status != NV_OK)
2158                 return status;
2159 
2160             // Record that we populated this page. hmm_block_cpu_fault_locked()
2161             // uses this to ensure pages that don't migrate get remote mapped.
2162             if (populated_page_mask)
2163                 uvm_page_mask_set(populated_page_mask, page_index);
2164         }
2165 
2166         // Since we have a stable snapshot of the CPU pages, we can
2167         // update the residency and protection information.
2168         uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2169         uvm_page_mask_set(&va_block->cpu.resident, page_index);
2170 
2171         cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
2172     }
2173 
2174     return NV_OK;
2175 }
2176 
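// hmm_range_fault_begin() snapshots the va_block's invalidation sequence
// number so that hmm_range_fault_retry() can detect an invalidate callback
// that ran while the va_block lock was dropped.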
2177 static void hmm_range_fault_begin(uvm_va_block_t *va_block)
2178 {
2179     uvm_thread_context_t *uvm_context = uvm_thread_context();
2180 
2181     uvm_assert_mutex_locked(&va_block->lock);
2182     uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed;
2183 }
2184 
2185 static bool hmm_range_fault_retry(uvm_va_block_t *va_block)
2186 {
2187     uvm_thread_context_t *uvm_context = uvm_thread_context();
2188 
2189     uvm_assert_mutex_locked(&va_block->lock);
2190     return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed;
2191 }
2192 
2193 // Make the region be resident on the CPU by calling hmm_range_fault() to fault
2194 // in CPU pages.
2195 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block,
2196                                        struct vm_area_struct *vma,
2197                                        unsigned long *hmm_pfns,
2198                                        uvm_va_block_region_t region,
2199                                        NvU8 *access_type,
2200                                        uvm_page_mask_t *populated_page_mask)
2201 {
2202     uvm_page_index_t page_index;
2203     int ret;
2204     struct hmm_range range = {
2205         .notifier = &va_block->hmm.notifier,
2206         .start = uvm_va_block_region_start(va_block, region),
2207         .end = uvm_va_block_region_end(va_block, region) + 1,
2208         .hmm_pfns = hmm_pfns + region.first,
2209         .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
2210         .dev_private_owner = &g_uvm_global,
2211     };
2212 
2213     for_each_va_block_page_in_region(page_index, region) {
2214         if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) ||
2215             (vma->vm_flags & VM_WRITE))
2216             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
2217         else
2218             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT;
2219     }
2220 
2221     hmm_range_fault_begin(va_block);
2222 
2223     // Mirror the VA block to the HMM address range.
2224     // Note that we request HMM to handle page faults, which means that it will
2225     // populate and map potentially not-yet-existing pages to the VMA.
2226     // Also note that mmu_interval_read_begin() calls wait_event() for any
2227     // parallel invalidation callbacks to finish so we can't hold locks that
2228     // the invalidation callback acquires.
2229     uvm_mutex_unlock(&va_block->lock);
2230 
2231     range.notifier_seq = mmu_interval_read_begin(range.notifier);
2232     ret = hmm_range_fault(&range);
2233 
2234     uvm_mutex_lock(&va_block->lock);
2235 
2236     if (ret)
2237         return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret);
2238 
2239     if (hmm_range_fault_retry(va_block))
2240         return NV_WARN_MORE_PROCESSING_REQUIRED;
2241 
2242     return populate_region(va_block,
2243                            hmm_pfns,
2244                            region,
2245                            populated_page_mask);
2246 }
2247 
2248 // Release the reference count on any pages that were made device exclusive.
2249 static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
2250                                      uvm_service_block_context_t *service_context)
2251 {
2252     uvm_va_block_region_t region = service_context->region;
2253     uvm_page_index_t page_index;
2254 
2255     for_each_va_block_page_in_region(page_index, region) {
2256         struct page *page = service_context->block_context.hmm.pages[page_index];
2257 
2258         if (!page)
2259             continue;
2260 
2261         unlock_page(page);
2262         put_page(page);
2263     }
2264 }
2265 
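// Service a GPU atomic fault: migrate the region to the CPU unless it is
// already fully CPU resident, then make the pages device exclusive so the
// faulting GPU can map them with atomic access.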
2266 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
2267                                                uvm_va_block_t *va_block,
2268                                                uvm_va_block_retry_t *va_block_retry,
2269                                                uvm_service_block_context_t *service_context)
2270 {
2271     uvm_va_block_region_t region = service_context->region;
2272     struct page **pages = service_context->block_context.hmm.pages;
2273     int npages;
2274     uvm_page_index_t page_index;
2275     uvm_make_resident_cause_t cause;
2276     NV_STATUS status;
2277 
2278     if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2279         !uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
2280         // There is an atomic GPU fault. We need to make sure no pages are
2281         // GPU resident so that make_device_exclusive_range() doesn't call
2282         // migrate_to_ram() and cause a va_space lock recursion problem.
2283         if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS)
2284             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
2285         else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS)
2286             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
2287         else
2288             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
2289 
2290         status = uvm_hmm_va_block_migrate_locked(va_block,
2291                                                  va_block_retry,
2292                                                  &service_context->block_context,
2293                                                  UVM_ID_CPU,
2294                                                  region,
2295                                                  cause);
2296         if (status != NV_OK)
2297             goto done;
2298 
        // make_device_exclusive_range() will try to call migrate_to_ram()
        // and deadlock with ourselves if the data isn't CPU resident.
2301         if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2302             !uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
2303             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2304             goto done;
2305         }
2306     }
2307 
2308     // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED
2309     // mmap() files so we check for that here and report a fatal fault.
    // Otherwise, with the current Linux 6.1 make_device_exclusive_range(),
    // the page is not made exclusive and we end up in an endless loop.
2312     if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) {
2313         status = NV_ERR_NOT_SUPPORTED;
2314         goto done;
2315     }
2316 
2317     hmm_range_fault_begin(va_block);
2318 
2319     uvm_mutex_unlock(&va_block->lock);
2320 
2321     npages = make_device_exclusive_range(service_context->block_context.mm,
2322         uvm_va_block_cpu_page_address(va_block, region.first),
2323         uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
2324         pages + region.first,
2325         &g_uvm_global);
2326 
2327     uvm_mutex_lock(&va_block->lock);
2328 
2329     if (npages < 0) {
2330         status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages);
2331         goto done;
2332     }
2333 
2334     while ((size_t)npages < uvm_va_block_region_num_pages(region))
2335         pages[region.first + npages++] = NULL;
2336 
2337     if (hmm_range_fault_retry(va_block)) {
2338         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2339         goto release;
2340     }
2341 
2342     status = NV_OK;
2343 
2344     for_each_va_block_page_in_region(page_index, region) {
2345         struct page *page = pages[page_index];
2346 
2347         if (!page) {
2348             // Record that one of the pages isn't exclusive but keep converting
2349             // the others.
2350             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2351             continue;
2352         }
2353 
        // If a CPU chunk is already allocated, check to see that it matches
        // what make_device_exclusive_range() found.
2356         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2357             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2358             UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2359             UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
2360         }
2361         else {
2362             NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2363 
2364             if (s == NV_OK) {
2365                 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2366                 uvm_page_mask_set(&va_block->cpu.resident, page_index);
2367             }
2368         }
2369 
2370         cpu_mapping_clear(va_block, page_index);
2371     }
2372 
2373     if (status != NV_OK)
2374         goto release;
2375 
2376     status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2377     if (status != NV_OK)
2378         goto release;
2379 
2380     status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2381 
2382 release:
2383     hmm_release_atomic_pages(va_block, service_context);
2384 
2385 done:
2386     return status;
2387 }
2388 
2389 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region)
2390 {
2391     uvm_page_index_t page_index;
2392 
2393     for_each_va_block_page_in_region(page_index, region) {
2394         if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG)
2395             return true;
2396     }
2397 
2398     return false;
2399 }
2400 
2401 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region)
2402 {
2403     uvm_processor_id_t gpu_id;
2404 
2405     for_each_gpu_id_in_mask(gpu_id, &va_block->resident) {
2406         uvm_va_block_gpu_state_t *gpu_state;
2407 
2408         gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2409         if (!uvm_page_mask_region_empty(&gpu_state->resident, region))
2410             return true;
2411     }
2412 
2413     return false;
2414 }
2415 
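// Service a fault whose destination residency is the CPU: make the region
// resident in system memory, migrating any device private pages back with
// migrate_vma_*(). GPU atomic faults are handled separately by
// hmm_block_atomic_fault_locked().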
2416 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
2417                                             uvm_va_block_t *va_block,
2418                                             uvm_va_block_retry_t *va_block_retry,
2419                                             uvm_service_block_context_t *service_context)
2420 {
2421     uvm_va_block_region_t region = service_context->region;
2422     struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
2423     NV_STATUS status;
2424     int ret;
2425     uvm_hmm_devmem_fault_context_t fault_context = {
2426         .processor_id = processor_id,
2427         .va_block = va_block,
2428         .va_block_retry = va_block_retry,
2429         .service_context = service_context,
2430     };
2431 
2432     // Normally the source page will be a device private page that is being
2433     // migrated to system memory. However, when it is a GPU fault, the source
2434     // page can be a system memory page that the GPU needs to remote map
    // instead. Note that migrate_vma_setup() won't select these types of
    // mappings/pages:
2437     //  - device exclusive PTEs
2438     //  - shared mappings
2439     //  - file backed mappings
2440     // Also, if the source and destination page are the same, the page reference
2441     // count won't be the "expected" count and migrate_vma_pages() won't migrate
2442     // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and
    // uvm_hmm_devmem_fault_finalize_and_map() still need to process these
2444     // pages even if src_pfn indicates they are not migrating.
2445     uvm_page_mask_zero(&fault_context.same_devmem_page_mask);
2446 
2447     if (!UVM_ID_IS_CPU(processor_id)) {
2448         if (is_atomic_fault(service_context->access_type, region)) {
2449             return hmm_block_atomic_fault_locked(processor_id,
2450                                                  va_block,
2451                                                  va_block_retry,
2452                                                  service_context);
2453         }
2454 
2455         status = hmm_make_resident_cpu(va_block,
2456                                        service_context->block_context.hmm.vma,
2457                                        service_context->block_context.hmm.src_pfns,
2458                                        region,
2459                                        service_context->access_type,
2460                                        &fault_context.same_devmem_page_mask);
2461         if (status != NV_OK)
2462             return status;
2463 
2464         // If no GPU has a resident copy, we can skip the migrate_vma_*().
2465         // This is necessary if uvm_hmm_must_use_sysmem() returned true.
2466         if (!is_gpu_resident(va_block, region)) {
2467             status = uvm_va_block_service_copy(processor_id,
2468                                                UVM_ID_CPU,
2469                                                va_block,
2470                                                va_block_retry,
2471                                                service_context);
2472             if (status != NV_OK)
2473                 return status;
2474 
2475             return uvm_va_block_service_finish(processor_id, va_block, service_context);
2476         }
2477     }
2478 
2479     args->vma = service_context->block_context.hmm.vma;
2480     args->src = service_context->block_context.hmm.src_pfns + region.first;
2481     args->dst = service_context->block_context.hmm.dst_pfns + region.first;
2482     args->start = uvm_va_block_region_start(va_block, region);
2483     args->end = uvm_va_block_region_end(va_block, region) + 1;
2484     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
2485     args->pgmap_owner = &g_uvm_global;
2486 
2487     if (UVM_ID_IS_CPU(processor_id)) {
2488         args->fault_page = service_context->cpu_fault.vmf->page;
2489     }
2490     else {
2491         args->flags |= MIGRATE_VMA_SELECT_SYSTEM;
2492         args->fault_page = NULL;
2493     }
2494 
2495     ret = migrate_vma_setup_locked(args, va_block);
2496     UVM_ASSERT(!ret);
2497 
2498     // The overall process here is to migrate pages from the GPU to the CPU
2499     // and possibly remote map the GPU to sysmem if accessed_by is set.
2500     // This is safe because we hold the va_block lock across the calls to
2501     // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(),
2502     // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize().
2503     // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block
2504     // lock, a sequence number is used to tell if an invalidate() callback
2505     // occurred while not holding the lock. If the sequence number changes,
2506     // all the locks need to be dropped (mm, va_space, va_block) and the whole
2507     // uvm_va_block_service_locked() called again. Otherwise, there were no
2508     // conflicting invalidate callbacks and our snapshots of the CPU page
2509     // tables are accurate and can be used to DMA pages and update GPU page
2510     // tables.
2511     status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context);
2512     if (status == NV_OK) {
2513         migrate_vma_pages(args);
2514         status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context);
2515     }
2516 
2517     migrate_vma_finalize(args);
2518 
2519     if (status == NV_WARN_NOTHING_TO_DO)
2520         status = NV_OK;
2521 
2522     return status;
2523 }
2524 
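// Record the source pages selected by migrate_vma_setup() in the va_block so
// they can be DMA mapped for the copy to 'dest_id': device private sources
// keep their existing GPU chunk, while system memory sources are wrapped in
// HMM CPU chunks. Pages that cannot be migrated are cleared from the masks.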
2525 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
2526                                          struct vm_area_struct *vma,
2527                                          const unsigned long *src_pfns,
2528                                          unsigned long *dst_pfns,
2529                                          uvm_va_block_region_t region,
2530                                          uvm_page_mask_t *page_mask,
2531                                          uvm_processor_id_t dest_id,
2532                                          uvm_service_block_context_t *service_context)
2533 {
2534     uvm_page_index_t page_index;
2535     NV_STATUS status = NV_OK;
2536 
2537     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2538         struct page *src_page;
2539 
2540         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
2541             // HMM currently has some limitations on what pages can be migrated.
2542             // For example, no file backed pages, device private pages owned by
2543             // a different device, device exclusive or swapped out pages.
2544             goto clr_mask;
2545         }
2546 
2547         // This is the page that will be copied to the destination GPU.
2548         src_page = migrate_pfn_to_page(src_pfns[page_index]);
2549         if (src_page) {
2550             if (is_device_private_page(src_page)) {
2551                 status = gpu_chunk_add(va_block, page_index, src_page);
2552                 if (status != NV_OK)
2553                     break;
2554                 continue;
2555             }
2556 
2557             if (PageSwapCache(src_page)) {
2558                 // TODO: Bug 4050579: Remove this when swap cached pages can be
2559                 // migrated.
2560                 if (service_context) {
2561                     service_context->block_context.hmm.swap_cached = true;
2562                     break;
2563                 }
2564 
2565                 goto clr_mask;
2566             }
2567 
2568             // If the page is already allocated, it is most likely a mirrored
2569             // page. Check to be sure it matches what we have recorded. The
2570             // page shouldn't be a staging page from a GPU to GPU migration
2571             // or a remote mapped atomic sysmem page because migrate_vma_setup()
2572             // found a normal page and non-mirrored pages are only known
2573             // privately to the UVM driver.
2574             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2575                 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
2576                 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2577                 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
2578             }
2579             else {
2580                 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
2581                 if (status != NV_OK)
2582                     goto clr_mask;
2583 
2584                 // Since there is a CPU resident page, there shouldn't be one
2585                 // anywhere else. TODO: Bug 3660922: Need to handle read
2586                 // duplication at some point.
2587                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2588 
2589                 // migrate_vma_setup() was able to isolate and lock the page;
2590                 // therefore, it is CPU resident and not mapped.
2591                 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2592                 uvm_page_mask_set(&va_block->cpu.resident, page_index);
2593             }
2594 
2595             // The call to migrate_vma_setup() will have inserted a migration
2596             // PTE so the CPU has no access.
2597             cpu_mapping_clear(va_block, page_index);
2598         }
2599         else {
            // It is OK to migrate an empty anonymous page; a zero page will
2601             // be allocated on the GPU. Just be sure to free any pages
2602             // used for GPU to GPU copies. It can't be an evicted page because
2603             // migrate_vma_setup() would have found a source page.
2604             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2605                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2606 
2607                 hmm_va_block_cpu_page_unpopulate(va_block, page_index);
2608             }
2609         }
2610 
2611         continue;
2612 
2613     clr_mask:
2614         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2615         uvm_page_mask_clear(page_mask, page_index);
2616         if (service_context)
2617             clear_service_context_masks(service_context, dest_id, page_index);
2618     }
2619 
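         // If nothing is left to migrate, or a swap cached page blocks the
         // migration, report that more processing is required so the caller
         // can handle the region another way.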
2620     if (uvm_page_mask_empty(page_mask) ||
2621         (service_context && service_context->block_context.hmm.swap_cached))
2622         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2623 
2624     if (status != NV_OK)
2625         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2626 
2627     return status;
2628 }
2629 
2630 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
2631                                                   uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2632 {
2633     uvm_processor_id_t processor_id;
2634     uvm_processor_id_t new_residency;
2635     uvm_va_block_t *va_block;
2636     uvm_va_block_retry_t *va_block_retry;
2637     uvm_service_block_context_t *service_context;
2638     uvm_perf_prefetch_hint_t *prefetch_hint;
2639     const unsigned long *src_pfns;
2640     unsigned long *dst_pfns;
2641     uvm_va_block_region_t region;
2642     uvm_page_mask_t *page_mask;
2643     NV_STATUS status;
2644 
2645     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2646     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2647     va_block = uvm_hmm_gpu_fault_event->va_block;
2648     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2649     service_context = uvm_hmm_gpu_fault_event->service_context;
2650     region = service_context->region;
2651     prefetch_hint = &service_context->prefetch_hint;
2652     src_pfns = service_context->block_context.hmm.src_pfns;
2653     dst_pfns = service_context->block_context.hmm.dst_pfns;
2654 
2655     // Build the migration mask.
2656     // Note that thrashing pinned pages are already accounted for in
2657     // service_context->resident_processors.
2658     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2659     uvm_page_mask_copy(page_mask,
2660                        &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
2661 
2662     status = dmamap_src_sysmem_pages(va_block,
2663                                      vma,
2664                                      src_pfns,
2665                                      dst_pfns,
2666                                      region,
2667                                      page_mask,
2668                                      new_residency,
2669                                      service_context);
2670     if (status != NV_OK)
2671         return status;
2672 
2673     // Do the alloc and copy but don't update the residency or mapping for the
2674     // new location yet.
2675     status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context);
2676     if (status != NV_OK)
2677         return status;
2678 
2679     // Record the destination PFNs of device private struct pages now that
2680     // uvm_va_block_service_copy() has populated the GPU destination pages.
2681     fill_dst_pfns(va_block,
2682                   src_pfns,
2683                   dst_pfns,
2684                   region,
2685                   page_mask,
2686                   &uvm_hmm_gpu_fault_event->same_devmem_page_mask,
2687                   new_residency);
2688 
2689     return status;
2690 }
2691 
2692 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2693 {
2694     uvm_processor_id_t processor_id;
2695     uvm_processor_id_t new_residency;
2696     uvm_va_block_t *va_block;
2697     uvm_va_block_retry_t *va_block_retry;
2698     uvm_service_block_context_t *service_context;
2699     const unsigned long *src_pfns;
2700     unsigned long *dst_pfns;
2701     uvm_va_block_region_t region;
2702     uvm_page_index_t page_index;
2703     uvm_page_mask_t *page_mask;
2704     NV_STATUS status, tracker_status;
2705 
2706     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2707     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2708     va_block = uvm_hmm_gpu_fault_event->va_block;
2709     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2710     service_context = uvm_hmm_gpu_fault_event->service_context;
2711     src_pfns = service_context->block_context.hmm.src_pfns;
2712     dst_pfns = service_context->block_context.hmm.dst_pfns;
2713     region = service_context->region;
2714     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2715 
2716     // There are a number of reasons why HMM will mark a page as not migrating
2717     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2718     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2719         unsigned long src_pfn = src_pfns[page_index];
2720 
2721         if (src_pfn & MIGRATE_PFN_MIGRATE)
2722             continue;
2723 
2724         // If a device private page isn't migrating and only the GPU page table
2725         // is being updated, continue to process it normally.
2726         if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index))
2727             continue;
2728 
2729         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2730         uvm_page_mask_clear(page_mask, page_index);
2731         clear_service_context_masks(service_context, new_residency, page_index);
2732     }
2733 
2734     if (uvm_page_mask_empty(page_mask))
2735         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2736     else
2737         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2738 
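         // Regardless of the servicing status, sync the va_block page and GPU
         // chunk state with which pages actually migrated.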
2739     tracker_status = sync_page_and_chunk_state(va_block,
2740                                                src_pfns,
2741                                                dst_pfns,
2742                                                region,
2743                                                page_mask,
2744                                                &uvm_hmm_gpu_fault_event->same_devmem_page_mask);
2745 
2746     return status == NV_OK ? tracker_status : status;
2747 }
2748 
2749 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
2750                                           uvm_processor_id_t new_residency,
2751                                           uvm_va_block_t *va_block,
2752                                           uvm_va_block_retry_t *va_block_retry,
2753                                           uvm_service_block_context_t *service_context)
2754 {
2755     struct mm_struct *mm = service_context->block_context.mm;
2756     struct vm_area_struct *vma = service_context->block_context.hmm.vma;
2757     uvm_va_block_region_t region = service_context->region;
2758     uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
2759     struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
2760     int ret;
2761     NV_STATUS status = NV_ERR_INVALID_ADDRESS;
2762 
2763     if (!mm)
2764         return status;
2765 
2766     uvm_assert_mmap_lock_locked(mm);
2767     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
2768     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
2769     uvm_assert_mutex_locked(&va_block->lock);
2770     UVM_ASSERT(vma);
2771 
2772     // If the desired destination is the CPU, try to fault in CPU pages.
2773     if (UVM_ID_IS_CPU(new_residency))
2774         return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context);
2775 
2776     uvm_hmm_gpu_fault_event.processor_id = processor_id;
2777     uvm_hmm_gpu_fault_event.new_residency = new_residency;
2778     uvm_hmm_gpu_fault_event.va_block = va_block;
2779     uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry;
2780     uvm_hmm_gpu_fault_event.service_context = service_context;
2781 
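         // Limit the migrate_vma arguments to the region being serviced. The
         // src/dst PFN arrays are indexed by va_block page_index, hence the
         // offset by region.first.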
2782     args->vma = vma;
2783     args->src = service_context->block_context.hmm.src_pfns + region.first;
2784     args->dst = service_context->block_context.hmm.dst_pfns + region.first;
2785     args->start = uvm_va_block_region_start(va_block, region);
2786     args->end = uvm_va_block_region_end(va_block, region) + 1;
2787     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
2788     args->pgmap_owner = &g_uvm_global;
2789     args->fault_page = NULL;
2790 
2791     ret = migrate_vma_setup_locked(args, va_block);
2792     UVM_ASSERT(!ret);
2793 
2794     // The overall process here is to migrate pages from the CPU or GPUs to the
2795     // faulting GPU.
2796     // This is safe because we hold the va_block lock across the calls to
2797     // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(),
2798     // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize().
2799     // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block
2800     // lock, a sequence number is used to tell if an invalidate() callback
2801     // occurred while not holding the lock. If the sequence number changes,
2802     // all the locks need to be dropped (mm, va_space, va_block) and
2803     // uvm_va_block_service_locked() must be called again. Otherwise, there were no
2804     // conflicting invalidate callbacks and our snapshots of the CPU page
2805     // tables are accurate and can be used to DMA pages and update GPU page
2806     // tables. TODO: Bug 3901904: there might be better ways of handling no
2807     // page being migrated.
2808     status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event);
2809     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2810         migrate_vma_finalize(args);
2811 
2812         // migrate_vma_setup() might not have been able to lock/isolate any
2813         // pages because they are swapped out or are device exclusive.
2814         // We do know that none of the pages in the region are zero pages
2815         // since migrate_vma_setup() would have reported that information.
2816         // Try to make it resident in system memory and retry the migration.
2817         status = hmm_make_resident_cpu(va_block,
2818                                        service_context->block_context.hmm.vma,
2819                                        service_context->block_context.hmm.src_pfns,
2820                                        region,
2821                                        service_context->access_type,
2822                                        NULL);
2823         return NV_WARN_MORE_PROCESSING_REQUIRED;
2824     }
2825 
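         // The allocation and copy succeeded: complete the struct page
         // migration and then finish servicing the fault (residency and
         // mapping updates).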
2826     if (status == NV_OK) {
2827         migrate_vma_pages(args);
2828         status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event);
2829     }
2830 
2831     migrate_vma_finalize(args);
2832 
2833     if (status == NV_WARN_NOTHING_TO_DO)
2834         status = NV_OK;
2835 
2836     return status;
2837 }
2838 
2839 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
2840                                                 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
2841 {
2842     uvm_va_block_t *va_block;
2843     uvm_va_block_retry_t *va_block_retry;
2844     uvm_va_block_context_t *va_block_context;
2845     const unsigned long *src_pfns;
2846     unsigned long *dst_pfns;
2847     uvm_va_block_region_t region;
2848     uvm_processor_id_t dest_id;
2849     uvm_page_mask_t *page_mask;
2850     NV_STATUS status;
2851 
2852     va_block = uvm_hmm_migrate_event->va_block;
2853     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
2854     va_block_context = uvm_hmm_migrate_event->va_block_context;
2855     src_pfns = va_block_context->hmm.src_pfns;
2856     dst_pfns = va_block_context->hmm.dst_pfns;
2857     region = uvm_hmm_migrate_event->region;
2858     dest_id = uvm_hmm_migrate_event->dest_id;
2859     page_mask = &uvm_hmm_migrate_event->page_mask;
2860     uvm_page_mask_init_from_region(page_mask, region, NULL);
2861     uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask);
2862 
2863     uvm_assert_mutex_locked(&va_block->lock);
2864 
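         // Migrating to the CPU allocates and populates destination system
         // memory pages; migrating to a GPU DMA maps the source system memory
         // pages for access by the destination GPU.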
2865     if (UVM_ID_IS_CPU(dest_id)) {
2866         status = alloc_and_copy_to_cpu(va_block,
2867                                        vma,
2868                                        src_pfns,
2869                                        dst_pfns,
2870                                        region,
2871                                        page_mask,
2872                                        &uvm_hmm_migrate_event->same_devmem_page_mask,
2873                                        UVM_ID_INVALID,
2874                                        NULL);
2875     }
2876     else {
2877         status = dmamap_src_sysmem_pages(va_block,
2878                                          vma,
2879                                          src_pfns,
2880                                          dst_pfns,
2881                                          region,
2882                                          page_mask,
2883                                          dest_id,
2884                                          NULL);
2885     }
2886     if (status != NV_OK)
2887         return status;
2888 
2889     status = uvm_va_block_make_resident_copy(va_block,
2890                                              va_block_retry,
2891                                              va_block_context,
2892                                              dest_id,
2893                                              region,
2894                                              page_mask,
2895                                              NULL,
2896                                              uvm_hmm_migrate_event->cause);
2897     if (status != NV_OK)
2898         return status;
2899 
2900     if (!UVM_ID_IS_CPU(dest_id)) {
2901         // Record the destination PFNs of device private struct pages now that
2902         // uvm_va_block_make_resident_copy() has populated the GPU destination
2903         // pages.
2904         fill_dst_pfns(va_block,
2905                       src_pfns,
2906                       dst_pfns,
2907                       region,
2908                       page_mask,
2909                       &uvm_hmm_migrate_event->same_devmem_page_mask,
2910                       dest_id);
2911     }
2912 
2913     return status;
2914 }
2915 
2916 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
2917 {
2918     uvm_va_block_t *va_block;
2919     uvm_va_block_retry_t *va_block_retry;
2920     uvm_va_block_context_t *va_block_context;
2921     uvm_va_block_region_t region;
2922     uvm_processor_id_t dest_id;
2923     uvm_page_index_t page_index;
2924     uvm_page_mask_t *page_mask;
2925     const unsigned long *src_pfns;
2926     unsigned long *dst_pfns;
2927 
2928     va_block = uvm_hmm_migrate_event->va_block;
2929     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
2930     va_block_context = uvm_hmm_migrate_event->va_block_context;
2931     region = uvm_hmm_migrate_event->region;
2932     dest_id = uvm_hmm_migrate_event->dest_id;
2933     page_mask = &uvm_hmm_migrate_event->page_mask;
2934     src_pfns = va_block_context->hmm.src_pfns;
2935     dst_pfns = va_block_context->hmm.dst_pfns;
2936 
2937     uvm_assert_mutex_locked(&va_block->lock);
2938 
2939     // There are a number of reasons why HMM will mark a page as not migrating
2940     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2941     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2942         unsigned long src_pfn = src_pfns[page_index];
2943 
2944         if (src_pfn & MIGRATE_PFN_MIGRATE)
2945             continue;
2946 
2947         // If a device private page isn't migrating and only the GPU page table
2948         // is being updated, continue to process it normally.
2949         if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index))
2950             continue;
2951 
2952         uvm_page_mask_clear(page_mask, page_index);
2953     }
2954 
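         // Commit the residency updates for the pages that actually migrated.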
2955     uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask);
2956 
2957     return sync_page_and_chunk_state(va_block,
2958                                      src_pfns,
2959                                      dst_pfns,
2960                                      region,
2961                                      page_mask,
2962                                      &uvm_hmm_migrate_event->same_devmem_page_mask);
2963 }
2964 
2965 static bool is_resident(uvm_va_block_t *va_block,
2966                         uvm_processor_id_t dest_id,
2967                         uvm_va_block_region_t region)
2968 {
2969     if (!uvm_processor_mask_test(&va_block->resident, dest_id))
2970         return false;
2971 
2972     return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region);
2973 }
2974 
2975 // Note that migrate_vma_*() doesn't handle asynchronous migrations so the
2976 // migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
2977 // TODO: Bug 3900785: investigate ways to implement async migration.
2978 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
2979                                           uvm_va_block_retry_t *va_block_retry,
2980                                           uvm_va_block_context_t *va_block_context,
2981                                           uvm_processor_id_t dest_id,
2982                                           uvm_va_block_region_t region,
2983                                           uvm_make_resident_cause_t cause)
2984 {
2985     uvm_hmm_migrate_event_t uvm_hmm_migrate_event;
2986     struct vm_area_struct *vma = va_block_context->hmm.vma;
2987     NvU64 start;
2988     NvU64 end;
2989     struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args;
2990     NV_STATUS status;
2991     int ret;
2992 
2993     UVM_ASSERT(vma);
2994     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
2995     uvm_assert_mmap_lock_locked(va_block_context->mm);
2996     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
2997     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
2998     uvm_assert_mutex_locked(&va_block->lock);
2999 
3000     start = uvm_va_block_region_start(va_block, region);
3001     end = uvm_va_block_region_end(va_block, region);
3002     UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end);
3003 
3004     uvm_hmm_migrate_event.va_block = va_block;
3005     uvm_hmm_migrate_event.va_block_retry = va_block_retry;
3006     uvm_hmm_migrate_event.va_block_context = va_block_context;
3007     uvm_hmm_migrate_event.region = region;
3008     uvm_hmm_migrate_event.dest_id = dest_id;
3009     uvm_hmm_migrate_event.cause = cause;
3010 
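         // If the destination is the CPU, only device private (GPU) pages need
         // to be selected since system memory pages are already CPU resident;
         // otherwise, both system memory and device private pages are
         // candidates for migration.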
3011     args->vma = vma;
3012     args->src = va_block_context->hmm.src_pfns + region.first;
3013     args->dst = va_block_context->hmm.dst_pfns + region.first;
3014     args->start = uvm_va_block_region_start(va_block, region);
3015     args->end = uvm_va_block_region_end(va_block, region) + 1;
3016     args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
3017                                            MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
3018     args->pgmap_owner = &g_uvm_global;
3019     args->fault_page = NULL;
3020 
3021     // Note that migrate_vma_setup() doesn't handle file-backed or VM_SPECIAL
3022     // VMAs, so if UvmMigrate() tries to migrate such a region, -EINVAL will
3023     // be returned and we will only try to make the pages CPU resident.
3024     ret = migrate_vma_setup_locked(args, va_block);
3025     if (ret)
3026         return hmm_make_resident_cpu(va_block,
3027                                      vma,
3028                                      va_block_context->hmm.src_pfns,
3029                                      region,
3030                                      NULL,
3031                                      NULL);
3032 
3033     // The overall process here is to migrate pages from the CPU or GPUs to the
3034     // destination processor. Note that block_migrate_add_mappings() handles
3035     // updating GPU mappings after the migration.
3036     // This is safe because we hold the va_block lock across the calls to
3037     // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
3038     // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
3039     // block_migrate_add_mappings().
3040     // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block
3041     // lock, a sequence number is used to tell if an invalidate() callback
3042     // occurred while not holding the lock. If the sequence number changes,
3043     // all the locks need to be dropped (mm, va_space, va_block) and
3044     // uvm_hmm_va_block_migrate_locked() must be called again. Otherwise, there were no
3045     // conflicting invalidate callbacks and our snapshots of the CPU page
3046     // tables are accurate and can be used to DMA pages and update GPU page
3047     // tables.
3048     status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event);
3049     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
3050         uvm_processor_id_t id;
3051         uvm_page_mask_t *page_mask;
3052 
3053         migrate_vma_finalize(args);
3054 
3055         // The CPU page tables might contain only device private pages, or
3056         // migrate_vma_setup() might not have been able to lock/isolate
3057         // any pages because they are swapped out or on another device.
3058         // We do know that none of the pages in the region are zero pages
3059         // since migrate_vma_setup() would have reported that information.
3060         // Collect all the pages that need to be faulted in and made CPU
3061         // resident, then do the hmm_range_fault() and retry.
3062         page_mask = &va_block_context->caller_page_mask;
3063         uvm_page_mask_init_from_region(page_mask, region, NULL);
3064 
3065         for_each_id_in_mask(id, &va_block->resident) {
3066             if (!uvm_page_mask_andnot(page_mask,
3067                                       page_mask,
3068                                       uvm_va_block_resident_mask_get(va_block, id)))
3069                 return NV_OK;
3070         }
3071 
3072         return hmm_make_resident_cpu(va_block,
3073                                      vma,
3074                                      va_block_context->hmm.src_pfns,
3075                                      region,
3076                                      NULL,
3077                                      NULL);
3078     }
3079 
3080     if (status == NV_OK) {
3081         migrate_vma_pages(args);
3082         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3083     }
3084 
3085     migrate_vma_finalize(args);
3086 
3087     if (status == NV_WARN_NOTHING_TO_DO)
3088         status = NV_OK;
3089 
3090     return status;
3091 }
3092 
3093 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
3094                                  uvm_va_block_context_t *va_block_context,
3095                                  NvU64 base,
3096                                  NvU64 length,
3097                                  uvm_processor_id_t dest_id,
3098                                  uvm_migrate_mode_t mode,
3099                                  uvm_tracker_t *out_tracker)
3100 {
3101     struct mm_struct *mm;
3102     uvm_va_block_t *va_block;
3103     uvm_va_block_retry_t va_block_retry;
3104     NvU64 addr, end, last_address;
3105     NV_STATUS status = NV_OK;
3106 
3107     if (!uvm_hmm_is_enabled(va_space))
3108         return NV_ERR_INVALID_ADDRESS;
3109 
3110     mm = va_block_context->mm;
3111     UVM_ASSERT(mm == va_space->va_space_mm.mm);
3112     uvm_assert_mmap_lock_locked(mm);
3113     uvm_assert_rwsem_locked(&va_space->lock);
3114 
3115     last_address = base + length - 1;
3116 
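         // Process the range one va_block at a time, clamping each iteration
         // to the intersection of the va_block and the backing VMA.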
3117     for (addr = base; addr < last_address; addr = end + 1) {
3118         struct vm_area_struct *vma;
3119 
3120         status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block);
3121         if (status != NV_OK)
3122             return status;
3123 
3124         end = va_block->end;
3125         if (end > last_address)
3126             end = last_address;
3127 
3128         vma = va_block_context->hmm.vma;
3129         if (end > vma->vm_end - 1)
3130             end = vma->vm_end - 1;
3131 
3132         status = hmm_migrate_range(va_block,
3133                                    &va_block_retry,
3134                                    va_block_context,
3135                                    dest_id,
3136                                    addr,
3137                                    end,
3138                                    mode,
3139                                    out_tracker);
3140         if (status != NV_OK)
3141             break;
3142     }
3143 
3144     return status;
3145 }
3146 
3147 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
3148                                             uvm_va_block_context_t *va_block_context,
3149                                             uvm_gpu_chunk_t *gpu_chunk,
3150                                             uvm_va_block_region_t chunk_region)
3151 {
3152     uvm_thread_context_t *uvm_context = uvm_thread_context();
3153     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3154     uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
3155     unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
3156     uvm_page_index_t page_index = chunk_region.first;
3157     int ret;
3158 
3159     uvm_assert_mutex_locked(&va_block->lock);
3160     // TODO: Bug 3368756: add support for large GPU pages.
3161     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1);
3162 
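         // migrate_device_range() triggers an MMU notifier invalidation for
         // the evicted range; flag this va_block so UVM's own invalidate
         // callback skips it since the va_block lock is already held here.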
3163     uvm_context->ignore_hmm_invalidate_va_block = va_block;
3164     ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region));
3165     uvm_context->ignore_hmm_invalidate_va_block = NULL;
3166     if (ret)
3167         return errno_to_nv_status(ret);
3168 
3169     return NV_OK;
3170 }
3171 
3172 // Note that the caller must initialize va_block_context->hmm.src_pfns by
3173 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this.
3174 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3175                                            uvm_va_block_context_t *va_block_context,
3176                                            const uvm_page_mask_t *pages_to_evict,
3177                                            uvm_va_block_region_t region,
3178                                            uvm_make_resident_cause_t cause,
3179                                            bool *out_accessed_by_set)
3180 {
3181     NvU64 start = uvm_va_block_region_start(va_block, region);
3182     NvU64 end = uvm_va_block_region_end(va_block, region);
3183     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3184     unsigned long *dst_pfns = va_block_context->hmm.dst_pfns;
3185     uvm_hmm_migrate_event_t uvm_hmm_migrate_event = {
3186         .va_block = va_block,
3187         .va_block_retry = NULL,
3188         .va_block_context = va_block_context,
3189         .region = region,
3190         .dest_id = UVM_ID_CPU,
3191         .cause = cause,
3192     };
3193     uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
3194     const uvm_va_policy_t *policy;
3195     uvm_va_policy_node_t *node;
3196     unsigned long npages;
3197     NV_STATUS status;
3198 
3199     uvm_assert_mutex_locked(&va_block->lock);
3200 
3201     if (out_accessed_by_set)
3202         *out_accessed_by_set = false;
3203 
3204     // Note that there is no VMA available when evicting HMM pages.
3205     va_block_context->hmm.vma = NULL;
3206 
3207     uvm_page_mask_copy(page_mask, pages_to_evict);
3208 
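         // Evict one VA policy range at a time so accessed_by settings can be
         // reported and each sub-region is copied and finalized separately.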
3209     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
3210         npages = uvm_va_block_region_num_pages(region);
3211 
3212         if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0)
3213             *out_accessed_by_set = true;
3214 
3215         // Pages resident on the GPU should not have a resident page in system
3216         // memory.
3217         // TODO: Bug 3660922: Need to handle read duplication at some point.
3218         UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));
3219 
3220         status = alloc_and_copy_to_cpu(va_block,
3221                                        NULL,
3222                                        src_pfns,
3223                                        dst_pfns,
3224                                        region,
3225                                        page_mask,
3226                                        NULL,
3227                                        UVM_ID_INVALID,
3228                                        NULL);
3229         if (status != NV_OK)
3230             goto err;
3231 
3232         status = uvm_va_block_make_resident_copy(va_block,
3233                                                  NULL,
3234                                                  va_block_context,
3235                                                  UVM_ID_CPU,
3236                                                  region,
3237                                                  page_mask,
3238                                                  NULL,
3239                                                  cause);
3240         if (status != NV_OK)
3241             goto err;
3242 
3243         migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);
3244 
3245         uvm_hmm_migrate_event.region = region;
3246 
3247         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3248         if (status != NV_OK)
3249             goto err;
3250 
3251         migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3252     }
3253 
3254     return NV_OK;
3255 
3256 err:
3257     migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3258     return status;
3259 }
3260 
3261 NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3262                                         uvm_va_block_context_t *va_block_context,
3263                                         const uvm_page_mask_t *pages_to_evict,
3264                                         uvm_va_block_region_t region,
3265                                         bool *out_accessed_by_set)
3266 {
3267     return hmm_va_block_evict_chunks(va_block,
3268                                      va_block_context,
3269                                      pages_to_evict,
3270                                      region,
3271                                      UVM_MAKE_RESIDENT_CAUSE_EVICTION,
3272                                      out_accessed_by_set);
3273 }
3274 
3275 NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
3276                                                 uvm_gpu_t *gpu,
3277                                                 uvm_va_block_context_t *va_block_context,
3278                                                 const uvm_page_mask_t *pages_to_evict,
3279                                                 uvm_va_block_region_t region)
3280 {
3281     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3282     uvm_va_block_gpu_state_t *gpu_state;
3283     uvm_page_index_t page_index;
3284     uvm_gpu_chunk_t *gpu_chunk;
3285     NV_STATUS status;
3286 
3287     uvm_assert_mutex_locked(&va_block->lock);
3288 
3289     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
3290     UVM_ASSERT(gpu_state);
3291     UVM_ASSERT(gpu_state->chunks);
3292 
3293     // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
3294     memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));
3295 
3296     // TODO: Bug 3368756: add support for large GPU pages.
3297     for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
3298         gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
3299                                                   gpu,
3300                                                   uvm_va_block_cpu_page_address(va_block, page_index));
3301         status = uvm_hmm_va_block_evict_chunk_prep(va_block,
3302                                                    va_block_context,
3303                                                    gpu_chunk,
3304                                                    uvm_va_block_region_for_page(page_index));
3305         if (status != NV_OK)
3306             return status;
3307     }
3308 
3309     return hmm_va_block_evict_chunks(va_block,
3310                                      va_block_context,
3311                                      pages_to_evict,
3312                                      region,
3313                                      UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
3314                                      NULL);
3315 }
3316 
3317 NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
3318 {
3319     unsigned long src_pfn = 0;
3320     unsigned long dst_pfn = 0;
3321     struct page *dst_page;
3322     NV_STATUS status = NV_OK;
3323     int ret;
3324 
3325     ret = migrate_device_range(&src_pfn, pfn, 1);
3326     if (ret)
3327         return errno_to_nv_status(ret);
3328 
3329     if (src_pfn & MIGRATE_PFN_MIGRATE) {
3330         // All the code for copying a vidmem page to sysmem relies on
3331         // having a va_block. However certain combinations of mremap()
3332         // and fork() can result in device-private pages being mapped
3333         // in a child process without a va_block.
3334         //
3335         // We don't expect the above to be a common occurrence so for
3336         // now we allocate a fresh zero page when evicting without a
3337         // va_block. However this results in child processes losing
3338         // data so make sure we warn about it. Ideally we would just
3339         // not migrate and SIGBUS the child if it tries to access the
3340         // page. However that would prevent unloading of the driver so
3341         // we're stuck with this until we fix the problem.
3342         // TODO: Bug 3902536: add code to migrate GPU memory without having a
3343         // va_block.
3344         WARN_ON(1);
3345         dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
3346         if (!dst_page) {
3347             status = NV_ERR_NO_MEMORY;
3348             goto out;
3349         }
3350 
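             // migrate_device_pages() expects the destination page to be
             // locked by the caller.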
3351         lock_page(dst_page);
3352         dst_pfn = migrate_pfn(page_to_pfn(dst_page));
3353 
3354         migrate_device_pages(&src_pfn, &dst_pfn, 1);
3355     }
3356 
3357 out:
3358     migrate_device_finalize(&src_pfn, &dst_pfn, 1);
3359 
3360     return status;
3361 }
3362 
3363 // The routines below are all for UVM-HMM tests.
3364 
3365 NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
3366                                         struct mm_struct *mm,
3367                                         NvU64 lookup_address,
3368                                         NvU64 *startp,
3369                                         NvU64 *endp,
3370                                         UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
3371 {
3372     struct vm_area_struct *vma;
3373     NvU64 start;
3374     NvU64 end;
3375 
3376     if (!uvm_hmm_is_enabled(va_space) || !mm)
3377         return NV_ERR_INVALID_ADDRESS;
3378 
3379     uvm_assert_mmap_lock_locked(mm);
3380     uvm_assert_rwsem_locked(&va_space->lock);
3381 
3382     // The VMA might have changed while mmap_lock was not held, so check it.
3383     vma = find_vma(mm, lookup_address);
3384     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3385         return NV_ERR_INVALID_ADDRESS;
3386 
3387     // Since managed VA ranges don't cover more than one VMA, return only the
3388     // intersecting range of the VA block and VMA.
3389     start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
3390     end = start + UVM_VA_BLOCK_SIZE - 1;
3391     if (start < vma->vm_start)
3392         start = vma->vm_start;
3393     if (end > vma->vm_end - 1)
3394         end = vma->vm_end - 1;
3395 
3396     *startp = start;
3397     *endp   = end;
3398 
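         // For this test query, report the CPU as the resident, mapped, and
         // populated processor at base page size granularity.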
3399     if (params) {
3400         uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
3401         params->resident_physical_size[0] = PAGE_SIZE;
3402         params->resident_on_count = 1;
3403 
3404         uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
3405         params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
3406                                   UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
3407         params->page_size[0] = PAGE_SIZE;
3408         params->mapped_on_count = 1;
3409 
3410         uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
3411         params->populated_on_count = 1;
3412     }
3413 
3414     return NV_OK;
3415 }
3416 
3417 NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
3418                                                  struct mm_struct *mm,
3419                                                  NvU64 lookup_address,
3420                                                  bool populate)
3421 {
3422     uvm_va_space_t *va_space = va_block->hmm.va_space;
3423     struct vm_area_struct *vma;
3424     struct hmm_range range;
3425     uvm_va_block_region_t region;
3426     unsigned long pfn;
3427     NvU64 end;
3428     int ret;
3429     NV_STATUS status;
3430 
3431     if (!uvm_hmm_is_enabled(va_space) || !mm)
3432         return NV_ERR_INVALID_ADDRESS;
3433 
3434     uvm_assert_mmap_lock_locked(mm);
3435     uvm_assert_rwsem_locked(&va_space->lock);
3436 
3437     // The VMA might have changed while mmap_lock was not held, so check it.
3438     vma = find_vma(mm, lookup_address);
3439     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3440         return NV_ERR_INVALID_ADDRESS;
3441 
3442     end = lookup_address + PAGE_SIZE;
3443     region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);
3444 
3445     range.notifier = &va_block->hmm.notifier;
3446     range.start = lookup_address;
3447     range.end = end;
3448     range.hmm_pfns = &pfn;
3449     range.default_flags = 0;
3450     range.pfn_flags_mask = 0;
3451     range.dev_private_owner = &g_uvm_global;
3452 
3453     if (populate) {
3454         range.default_flags = HMM_PFN_REQ_FAULT;
3455         if (vma->vm_flags & VM_WRITE)
3456             range.default_flags |= HMM_PFN_REQ_WRITE;
3457     }
3458 
3459     uvm_hmm_migrate_begin_wait(va_block);
3460 
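         // Snapshot the CPU page table entry with hmm_range_fault() and retry
         // if an invalidation races with taking the va_block lock (the usual
         // mmu_interval_read_begin()/mmu_interval_read_retry() pattern).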
3461     while (true) {
3462         range.notifier_seq = mmu_interval_read_begin(range.notifier);
3463         ret = hmm_range_fault(&range);
3464         if (ret == -EBUSY)
3465             continue;
3466         if (ret) {
3467             uvm_hmm_migrate_finish(va_block);
3468             return errno_to_nv_status(ret);
3469         }
3470 
3471         uvm_mutex_lock(&va_block->lock);
3472 
3473         if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
3474             break;
3475 
3476         uvm_mutex_unlock(&va_block->lock);
3477     }
3478 
3479     // Update the va_block CPU state based on the snapshot.
3480     // Note that we have to adjust the pfns address since it will be indexed
3481     // by region.first.
3482     status = populate_region(va_block, &pfn - region.first, region, NULL);
3483 
3484     uvm_mutex_unlock(&va_block->lock);
3485     uvm_hmm_migrate_finish(va_block);
3486 
3487     return status;
3488 }
3489 
3490 NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
3491 {
3492     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3493 
3494     atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);
3495 
3496     return NV_OK;
3497 }
3498 
3499 NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
3500                                 struct mm_struct *mm,
3501                                 UVM_TEST_VA_RANGE_INFO_PARAMS *params)
3502 {
3503     uvm_range_tree_node_t *tree_node;
3504     const uvm_va_policy_node_t *node;
3505     struct vm_area_struct *vma;
3506     uvm_va_block_t *va_block;
3507 
3508     if (!mm || !uvm_hmm_is_enabled(va_space))
3509         return NV_ERR_INVALID_ADDRESS;
3510 
3511     uvm_assert_mmap_lock_locked(mm);
3512     uvm_assert_rwsem_locked(&va_space->lock);
3513 
3514     params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
3515     params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
3516     params->va_range_start = 0;
3517     params->va_range_end = ULONG_MAX;
3518     params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
3519     memset(&params->preferred_location, 0, sizeof(params->preferred_location));
3520     params->accessed_by_count = 0;
3521     params->managed.vma_start = 0;
3522     params->managed.vma_end = 0;
3523     params->managed.is_zombie = NV_FALSE;
3524     params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
3525 
3526     vma = find_vma(mm, params->lookup_address);
3527     if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
3528         return NV_ERR_INVALID_ADDRESS;
3529 
3530     params->va_range_start = vma->vm_start;
3531     params->va_range_end   = vma->vm_end - 1;
3532     params->managed.vma_start = vma->vm_start;
3533     params->managed.vma_end   = vma->vm_end - 1;
3534 
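         // Narrow the reported range to the va_block covering the lookup
         // address and, if present, the policy node containing it. If no
         // va_block covers the address, report the surrounding hole instead.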
3535     uvm_mutex_lock(&va_space->hmm.blocks_lock);
3536     tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
3537     if (!tree_node) {
3538         UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
3539                                                &params->va_range_start, &params->va_range_end) == NV_OK);
3540         uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3541         return NV_OK;
3542     }
3543 
3544     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3545     va_block = hmm_va_block_from_node(tree_node);
3546     uvm_mutex_lock(&va_block->lock);
3547 
3548     params->va_range_start = va_block->start;
3549     params->va_range_end   = va_block->end;
3550 
3551     node = uvm_va_policy_node_find(va_block, params->lookup_address);
3552     if (node) {
3553         uvm_processor_id_t processor_id;
3554 
3555         if (params->va_range_start < node->node.start)
3556             params->va_range_start = node->node.start;
3557         if (params->va_range_end > node->node.end)
3558             params->va_range_end = node->node.end;
3559 
3560         params->read_duplication = node->policy.read_duplication;
3561 
3562         if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
3563             uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
3564 
3565         for_each_id_in_mask(processor_id, &node->policy.accessed_by)
3566             uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
3567     }
3568     else {
3569         uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
3570                                     &params->va_range_start, &params->va_range_end);
3571     }
3572 
3573     uvm_mutex_unlock(&va_block->lock);
3574 
3575     return NV_OK;
3576 }
3577 
3578 // TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
3579 // for VMAs other than anonymous private memory.
3580 bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
3581                              uvm_va_block_context_t *va_block_context)
3582 {
3583     struct vm_area_struct *vma = va_block_context->hmm.vma;
3584 
3585     uvm_assert_mutex_locked(&va_block->lock);
3586 
3587     if (!uvm_va_block_is_hmm(va_block))
3588         return false;
3589 
3590     UVM_ASSERT(vma);
3591     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
3592     uvm_assert_mmap_lock_locked(va_block_context->mm);
3593 
3594     // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
3595     if (va_block_context->hmm.swap_cached)
3596         return true;
3597 
3598     // migrate_vma_setup() can't migrate VM_SPECIAL VMAs, so we have to force
3599     // a GPU remote mapping.
3600     // TODO: Bug 3660968: add support for file-backed migrations.
3601     // TODO: Bug 3368756: add support for transparent huge page migrations.
3602     return !vma_is_anonymous(vma) ||
3603            (vma->vm_flags & VM_SPECIAL) ||
3604            vma_is_dax(vma) ||
3605            is_vm_hugetlb_page(vma);
3606 }
3607 
3608 #endif // UVM_IS_CONFIG_HMM()
3609 
3610