1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_hmm.h"
25 
26 // Support for HMM ( https://docs.kernel.org/mm/hmm.html ):
27 
28 #ifdef NVCPU_X86_64
29 static bool uvm_disable_hmm = false;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: false (HMM is enabled if possible). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
36 #else
37 // So far, we've only tested HMM on x86_64, so disable it by default everywhere
38 // else.
39 static bool uvm_disable_hmm = true;
MODULE_PARM_DESC(uvm_disable_hmm,
                 "Force-disable HMM functionality in the UVM driver. "
                 "Default: true (HMM is not enabled on this CPU architecture). "
                 "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
                 "configuration, or if ATS settings conflict with HMM.");
46 #endif
47 
48 module_param(uvm_disable_hmm, bool, 0444);
49 
50 #if UVM_IS_CONFIG_HMM()
51 
52 #include <linux/hmm.h>
53 #include <linux/rmap.h>
54 #include <linux/migrate.h>
55 #include <linux/userfaultfd_k.h>
56 #include <linux/memremap.h>
57 #include <linux/wait.h>
58 
59 #include "uvm_common.h"
60 #include "uvm_gpu.h"
61 #include "uvm_pmm_gpu.h"
62 #include "uvm_hal_types.h"
63 #include "uvm_va_block_types.h"
64 #include "uvm_va_space_mm.h"
65 #include "uvm_va_space.h"
66 #include "uvm_va_range.h"
67 #include "uvm_range_tree.h"
68 #include "uvm_pmm_sysmem.h"
69 #include "uvm_lock.h"
70 #include "uvm_api.h"
71 #include "uvm_va_policy.h"
72 #include "uvm_tools.h"
73 
74 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
75                                uvm_page_index_t page_index,
76                                struct page *page);
77 
78 typedef struct
79 {
80     uvm_processor_id_t processor_id;
81     uvm_processor_id_t new_residency;
82     uvm_va_block_t *va_block;
83     uvm_va_block_retry_t *va_block_retry;
84     uvm_service_block_context_t *service_context;
85     uvm_page_mask_t page_mask;
86     uvm_page_mask_t same_devmem_page_mask;
87 } uvm_hmm_gpu_fault_event_t;
88 
89 typedef struct
90 {
91     uvm_va_block_t *va_block;
92     uvm_va_block_retry_t *va_block_retry;
93     uvm_va_block_context_t *va_block_context;
94     uvm_va_block_region_t region;
95     uvm_processor_id_t dest_id;
96     uvm_make_resident_cause_t cause;
97     uvm_page_mask_t page_mask;
98     uvm_page_mask_t same_devmem_page_mask;
99 } uvm_hmm_migrate_event_t;
100 
101 typedef struct
102 {
103     uvm_processor_id_t processor_id;
104     uvm_va_block_t *va_block;
105     uvm_va_block_retry_t *va_block_retry;
106     uvm_service_block_context_t *service_context;
107     uvm_page_mask_t page_mask;
108     uvm_page_mask_t same_devmem_page_mask;
109 } uvm_hmm_devmem_fault_context_t;
110 
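// HMM is enabled system-wide only when the module parameter allows it, ATS is
// not enabled, and the va_space_mm infrastructure is available system-wide.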
111 bool uvm_hmm_is_enabled_system_wide(void)
112 {
113     return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
114 }
115 
116 bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
117 {
118     return uvm_hmm_is_enabled_system_wide() &&
119            uvm_va_space_mm_enabled(va_space) &&
120            !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM);
121 }
122 
123 static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
124 {
125     if (!node)
126         return NULL;
127     return container_of(node, uvm_va_block_t, hmm.node);
128 }
129 
130 NV_STATUS uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
131 {
132     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
133     struct mm_struct *mm = va_space->va_space_mm.mm;
134     int ret;
135 
136     if (!uvm_hmm_is_enabled(va_space))
137         return NV_OK;
138 
139     uvm_assert_mmap_lock_locked_write(mm);
140     uvm_assert_rwsem_locked_write(&va_space->lock);
141 
142     uvm_range_tree_init(&hmm_va_space->blocks);
143     uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF);
144 
145     // Initialize MMU interval notifiers for this process.
146     // This allows mmu_interval_notifier_insert() to be called without holding
147     // the mmap_lock for write.
    // Note: there is no __mmu_notifier_unregister(); this call just allocates
    // memory which is attached to the mm_struct and freed when the mm_struct
    // is freed.
151     ret = __mmu_notifier_register(NULL, mm);
152     if (ret)
153         return errno_to_nv_status(ret);
154 
155     return NV_OK;
156 }
157 
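// Destroy all HMM va_blocks in this va_space. The va_space lock is held for
// write, so no new blocks can be inserted while the interval notifiers are
// removed and the blocks are killed.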
158 void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
159 {
160     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
161     uvm_range_tree_node_t *node, *next;
162     uvm_va_block_t *va_block;
163 
164     if (!uvm_hmm_is_enabled(va_space))
165         return;
166 
167     uvm_assert_rwsem_locked_write(&va_space->lock);
168 
169     // The blocks_lock is not needed when the va_space lock is held for write.
170     uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
171         va_block = hmm_va_block_from_node(node);
172         uvm_range_tree_remove(&hmm_va_space->blocks, node);
173         mmu_interval_notifier_remove(&va_block->hmm.notifier);
174         uvm_va_block_kill(va_block);
175     }
176 }
177 
178 static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
179                                         uvm_gpu_t *gpu,
180                                         struct mm_struct *mm)
181 {
182     uvm_va_policy_node_t *node;
183 
184     uvm_mutex_lock(&va_block->lock);
185 
186     // Reset preferred location and accessed-by of policy nodes if needed.
187     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
188         if (uvm_id_equal(node->policy.preferred_location, gpu->id))
189             node->policy.preferred_location = UVM_ID_INVALID;
190 
191         uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
192     }
193 
194     // Migrate and free any remaining resident allocations on this GPU.
195     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
196 
197     uvm_mutex_unlock(&va_block->lock);
198 }
199 
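// Called when a GPU is unregistered from the va_space: clear any policy
// references to the GPU and migrate or free its remaining resident
// allocations in every HMM va_block.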
200 void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
201 {
202     uvm_range_tree_node_t *node;
203     uvm_va_block_t *va_block;
204 
205     if (!uvm_hmm_is_enabled(va_space))
206         return;
207 
208     if (mm)
209         uvm_assert_mmap_lock_locked(mm);
210     uvm_assert_rwsem_locked_write(&va_space->lock);
211 
212     uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
213         va_block = hmm_va_block_from_node(node);
214 
215         hmm_va_block_unregister_gpu(va_block, gpu, mm);
216     }
217 }
218 
219 static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
220                                              uvm_gpu_va_space_t *gpu_va_space,
221                                              uvm_va_block_context_t *va_block_context)
222 {
223     uvm_mutex_lock(&va_block->lock);
224 
225     uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
226 
227     uvm_mutex_unlock(&va_block->lock);
228 
229     // TODO: Bug 3660922: Need to handle read duplication at some point.
230     // See range_remove_gpu_va_space_managed().
231 }
232 
233 void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
234                                  uvm_gpu_va_space_t *gpu_va_space,
235                                  struct mm_struct *mm)
236 {
237     uvm_va_block_context_t *va_block_context;
238     uvm_range_tree_node_t *node, *next;
239     uvm_va_block_t *va_block;
240 
241     if (!uvm_hmm_is_enabled(va_space))
242         return;
243 
244     if (mm)
245         uvm_assert_mmap_lock_locked(mm);
246     uvm_assert_rwsem_locked_write(&va_space->lock);
247 
248     va_block_context = uvm_va_space_block_context(va_space, mm);
249 
250     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
251         va_block = hmm_va_block_from_node(node);
252 
253         hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
254     }
255 }
256 
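// MMU interval notifier callback for an HMM va_block: unmap the invalidated
// region from all GPUs and drop stale CPU chunk references. Returning false
// (i.e., failing the invalidation) is only allowed for non-blockable ranges.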
257 static bool hmm_invalidate(uvm_va_block_t *va_block,
258                            const struct mmu_notifier_range *range,
259                            unsigned long cur_seq)
260 {
261     uvm_thread_context_t *uvm_context = uvm_thread_context();
262     struct mmu_interval_notifier *mni = &va_block->hmm.notifier;
263     struct mm_struct *mm = mni->mm;
264     uvm_va_block_context_t *va_block_context;
265     uvm_va_block_region_t region;
266     NvU64 start, end;
267     uvm_processor_id_t id;
268     NV_STATUS status = NV_OK;
269 
    // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release()
    // doesn't remove the interval notifiers from the mm_struct, so there will
    // be a full-range MMU_NOTIFY_UNMAP event after the release from
    // unmap_vmas() during exit_mmap().
274     if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE)
275         return true;
276 
277     // Blockable is only set false by
278     // mmu_notifier_invalidate_range_start_nonblock() which is only called in
279     // __oom_reap_task_mm().
280     if (!mmu_notifier_range_blockable(range))
281         return false;
282 
283     // We only ignore invalidations in this context whilst holding the
284     // va_block lock. This prevents deadlock when try_to_migrate()
285     // calls the notifier, but holding the lock prevents other threads
286     // invalidating PTEs so we can safely assume the results of
287     // migrate_vma_setup() are correct.
288     if (uvm_context->ignore_hmm_invalidate_va_block == va_block ||
289         ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) &&
290          range->owner == &g_uvm_global))
291         return true;
292 
293     va_block_context = uvm_va_block_context_alloc(mm);
294     if (!va_block_context)
295         return true;
296 
297     uvm_mutex_lock(&va_block->lock);
298 
299     // mmu_interval_notifier_remove() is always called before marking a
300     // va_block as dead so this va_block has to be alive.
301     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
302 
303     // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
304     // Also note that hmm_invalidate() can be called when a new va_block is not
305     // yet inserted into the va_space->hmm.blocks table while the original
306     // va_block is being split. The original va_block may have its end address
307     // updated before the mmu interval notifier is updated so this invalidate
308     // may be for a range past the va_block end address.
309     start = range->start;
310     end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
311     if (start < va_block->start)
312         start = va_block->start;
313     if (end > va_block->end)
314         end = va_block->end;
315     if (start > end)
316         goto unlock;
317 
318     // These will be equal if no other thread causes an invalidation
319     // whilst the va_block lock was dropped.
320     uvm_context->hmm_invalidate_seqnum++;
321     va_block->hmm.changed++;
322 
323     mmu_interval_set_seq(mni, cur_seq);
324 
325     region = uvm_va_block_region_from_start_end(va_block, start, end);
326 
327     va_block_context->hmm.vma = NULL;
328     va_block_context->policy = NULL;
329 
330     // We only need to unmap GPUs since Linux handles the CPUs.
331     for_each_gpu_id_in_mask(id, &va_block->mapped) {
332         status = uvm_va_block_unmap(va_block,
333                                     va_block_context,
334                                     id,
335                                     region,
336                                     uvm_va_block_map_mask_get(va_block, id),
337                                     &va_block->tracker);
338         // Note that the va_block lock can be dropped, relocked, and
339         // NV_ERR_MORE_PROCESSING_REQUIRED returned.
340         if (status != NV_OK)
341             break;
342     }
343 
344     if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR)
345         uvm_va_block_munmap_region(va_block, region);
346 
347     if (status == NV_OK)
348         status = uvm_tracker_wait(&va_block->tracker);
349 
350     // Remove stale HMM struct page pointers to system memory.
351     uvm_va_block_remove_cpu_chunks(va_block, region);
352 
353 unlock:
354     uvm_mutex_unlock(&va_block->lock);
355 
356     uvm_va_block_context_free(va_block_context);
357 
358     UVM_ASSERT(status == NV_OK);
359     return true;
360 }
361 
362 static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni,
363                                      const struct mmu_notifier_range *range,
364                                      unsigned long cur_seq)
365 {
366     uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier);
367 
368     UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq));
369 }
370 
371 static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops =
372 {
373     .invalidate = uvm_hmm_invalidate_entry,
374 };
375 
376 NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space,
377                                 NvU64 addr,
378                                 uvm_va_block_t **va_block_ptr)
379 {
380     uvm_range_tree_node_t *node;
381 
382     if (!uvm_hmm_is_enabled(va_space))
383         return NV_ERR_INVALID_ADDRESS;
384 
385     uvm_assert_rwsem_locked(&va_space->lock);
386 
387     uvm_mutex_lock(&va_space->hmm.blocks_lock);
388     node = uvm_range_tree_find(&va_space->hmm.blocks, addr);
389     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
390 
391     if (!node)
392         return NV_ERR_OBJECT_NOT_FOUND;
393 
394     *va_block_ptr = hmm_va_block_from_node(node);
395 
396     return NV_OK;
397 }
398 
399 static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block)
400 {
401     uvm_thread_context_t *uvm_context = uvm_thread_context();
402     int ret;
403 
404     // It's only safe to ignore invalidations whilst doing a migration
405     // and holding the va_block lock.
406     uvm_assert_mutex_locked(&va_block->lock);
407     uvm_context->ignore_hmm_invalidate_va_block = va_block;
408     ret = migrate_vma_setup(args);
409 
410     // We shouldn't be generating any more invalidations now.
411     uvm_context->ignore_hmm_invalidate_va_block = NULL;
412     return ret;
413 }
414 
415 static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma,
416                                  unsigned long addr,
417                                  bool allow_unreadable_vma)
418 {
419     // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support
420     // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ
421     // but we allow those VMAs to have policy set on them.
422     // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled
423     // by uvm_hmm_must_use_sysmem() forcing residency to the CPU.
424     return vma &&
425            addr >= vma->vm_start &&
426            !userfaultfd_armed(vma) &&
427            !(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
428            !uvm_vma_is_managed(vma) &&
429            (allow_unreadable_vma || (vma->vm_flags & VM_READ));
430 }
431 
432 static void hmm_va_block_init(uvm_va_block_t *va_block,
433                               uvm_va_space_t *va_space,
434                               NvU64 start,
435                               NvU64 end)
436 {
437     va_block->hmm.va_space = va_space;
438     va_block->hmm.node.start = start;
439     va_block->hmm.node.end = end;
440     uvm_range_tree_init(&va_block->hmm.va_policy_tree);
441     uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
442 }
443 
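// Look up the HMM va_block covering addr, creating it on demand. A new block
// spans the largest aligned UVM_VA_BLOCK_SIZE interval around addr that does
// not overlap existing UVM va_ranges or other HMM va_blocks.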
444 static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
445                                           NvU64 addr,
446                                           bool allow_unreadable_vma,
447                                           uvm_va_block_context_t *va_block_context,
448                                           uvm_va_block_t **va_block_ptr)
449 {
450     struct mm_struct *mm = va_space->va_space_mm.mm;
451     struct vm_area_struct *vma;
452     uvm_va_block_t *va_block;
453     NvU64 start, end;
454     NV_STATUS status;
455     int ret;
456 
457     if (!uvm_hmm_is_enabled(va_space))
458         return NV_ERR_INVALID_ADDRESS;
459 
460     UVM_ASSERT(mm);
461     UVM_ASSERT(!va_block_context || va_block_context->mm == mm);
462     uvm_assert_mmap_lock_locked(mm);
463     uvm_assert_rwsem_locked(&va_space->lock);
464     UVM_ASSERT(PAGE_ALIGNED(addr));
465 
466     // Note that we have to allow PROT_NONE VMAs so that policies can be set.
467     vma = find_vma(mm, addr);
468     if (!uvm_hmm_vma_is_valid(vma, addr, allow_unreadable_vma))
469         return NV_ERR_INVALID_ADDRESS;
470 
471     // Since we only hold the va_space read lock, there can be multiple
472     // parallel va_block insertions.
473     uvm_mutex_lock(&va_space->hmm.blocks_lock);
474 
475     va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr));
476     if (va_block)
477         goto done;
478 
479     // The va_block is always created to cover the whole aligned
480     // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or
481     // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it
482     // doesn't overlap.
483     start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
484     end = start + UVM_VA_BLOCK_SIZE - 1;
485 
486     // Search for existing UVM va_ranges in the start/end interval and create
487     // a maximum interval that doesn't overlap any existing UVM va_ranges.
488     // We know that 'addr' is not within a va_range or
489     // hmm_va_block_find_create() wouldn't be called.
490     status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
491     UVM_ASSERT(status == NV_OK);
492 
493     // Search for existing HMM va_blocks in the start/end interval and create
494     // a maximum interval that doesn't overlap any existing HMM va_blocks.
495     status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
496     UVM_ASSERT(status == NV_OK);
497 
498     // Create a HMM va_block with a NULL va_range pointer.
499     status = uvm_va_block_create(NULL, start, end, &va_block);
500     if (status != NV_OK)
501         goto err_unlock;
502 
503     hmm_va_block_init(va_block, va_space, start, end);
504 
505     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
506                                        mm,
507                                        start,
508                                        end - start + 1,
509                                        &uvm_hmm_notifier_ops);
510     if (ret) {
511         status = errno_to_nv_status(ret);
512         goto err_release;
513     }
514 
515     status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
516     UVM_ASSERT(status == NV_OK);
517 
518 done:
519     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
520     if (va_block_context)
521         va_block_context->hmm.vma = vma;
522     *va_block_ptr = va_block;
523     return NV_OK;
524 
525 err_release:
526     uvm_va_block_release(va_block);
527 
528 err_unlock:
529     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
530     return status;
531 }
532 
533 NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
534                                        NvU64 addr,
535                                        uvm_va_block_context_t *va_block_context,
536                                        uvm_va_block_t **va_block_ptr)
537 {
538     return hmm_va_block_find_create(va_space, addr, false, va_block_context, va_block_ptr);
539 }
540 
541 NV_STATUS uvm_hmm_find_vma(uvm_va_block_context_t *va_block_context, NvU64 addr)
542 {
543     struct mm_struct *mm = va_block_context->mm;
544     struct vm_area_struct *vma;
545 
546     if (!mm)
547         return NV_ERR_INVALID_ADDRESS;
548 
549     uvm_assert_mmap_lock_locked(mm);
550 
551     vma = find_vma(mm, addr);
552     if (!uvm_hmm_vma_is_valid(vma, addr, false))
553         return NV_ERR_INVALID_ADDRESS;
554 
555     va_block_context->hmm.vma = vma;
556 
557     return NV_OK;
558 }
559 
560 bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
561                                         uvm_va_block_context_t *va_block_context,
562                                         uvm_va_block_region_t region)
563 {
564     uvm_assert_mutex_locked(&va_block->lock);
565 
566     if (uvm_va_block_is_hmm(va_block)) {
567         struct vm_area_struct *vma = va_block_context->hmm.vma;
568 
569         UVM_ASSERT(vma);
570         UVM_ASSERT(va_block_context->mm == vma->vm_mm);
571         uvm_assert_mmap_lock_locked(va_block_context->mm);
572         UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
573         UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
574     }
575 
576     return true;
577 }
578 
579 void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
580 {
581     // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
582     service_context->block_context.hmm.swap_cached = false;
583 }
584 
585 NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
586 {
587     if (uvm_mutex_trylock(&va_block->hmm.migrate_lock))
588         return NV_OK;
589 
590     return NV_ERR_BUSY_RETRY;
591 }
592 
593 void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
594 {
595     uvm_mutex_lock(&va_block->hmm.migrate_lock);
596 }
597 
598 void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
599 {
600     uvm_mutex_unlock(&va_block->hmm.migrate_lock);
601 }
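
// Typical usage of the migrate lock (see hmm_migrate_range() below):
//
//     uvm_hmm_migrate_begin_wait(va_block);
//     uvm_mutex_lock(&va_block->lock);
//     ... migrate one or more policy regions ...
//     uvm_mutex_unlock(&va_block->lock);
//     uvm_hmm_migrate_finish(va_block);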
602 
// Migrate the given range [start, end] within a va_block to dest_id.
604 static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
605                                    uvm_va_block_retry_t *va_block_retry,
606                                    uvm_va_block_context_t *va_block_context,
607                                    uvm_processor_id_t dest_id,
608                                    NvU64 start,
609                                    NvU64 end,
610                                    uvm_migrate_mode_t mode,
611                                    uvm_tracker_t *out_tracker)
612 {
613     uvm_va_block_region_t region;
614     uvm_va_policy_node_t *node;
615     const uvm_va_policy_t *policy;
616     NV_STATUS status = NV_OK;
617 
618     uvm_hmm_migrate_begin_wait(va_block);
619     uvm_mutex_lock(&va_block->lock);
620 
621     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
622         va_block_context->policy = policy;
623 
624         // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
625         // va_block lock, the policy remains valid because we hold the mmap
626         // lock so munmap can't remove the policy, and the va_space lock so the
627         // policy APIs can't change the policy.
628         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
629                                            va_block_retry,
630                                            uvm_va_block_migrate_locked(va_block,
631                                                                        va_block_retry,
632                                                                        va_block_context,
633                                                                        region,
634                                                                        dest_id,
635                                                                        mode,
636                                                                        out_tracker));
637         if (status != NV_OK)
638             break;
639     }
640 
641     uvm_mutex_unlock(&va_block->lock);
642     uvm_hmm_migrate_finish(va_block);
643 
644     return status;
645 }
646 
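// Migrate all HMM va_blocks in the va_space back to the CPU, one valid VMA
// region at a time.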
647 void uvm_hmm_evict_va_blocks(uvm_va_space_t *va_space)
648 {
649     // We can't use uvm_va_space_mm_retain(), because the va_space_mm
650     // should already be dead by now.
651     struct mm_struct *mm = va_space->va_space_mm.mm;
652     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
653     uvm_range_tree_node_t *node, *next;
654     uvm_va_block_t *va_block;
655     uvm_va_block_context_t *block_context;
656 
657     uvm_down_read_mmap_lock(mm);
658     uvm_va_space_down_write(va_space);
659 
660     uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
661         uvm_va_block_region_t region;
662         struct vm_area_struct *vma;
663 
664         va_block = hmm_va_block_from_node(node);
665         block_context = uvm_va_space_block_context(va_space, mm);
666         uvm_hmm_migrate_begin_wait(va_block);
667         uvm_mutex_lock(&va_block->lock);
668         for_each_va_block_vma_region(va_block, mm, vma, &region) {
669             if (!uvm_hmm_vma_is_valid(vma, vma->vm_start, false))
670                 continue;
671 
672             block_context->hmm.vma = vma;
673             block_context->policy = &uvm_va_policy_default;
674             uvm_hmm_va_block_migrate_locked(va_block,
675                                             NULL,
676                                             block_context,
677                                             UVM_ID_CPU,
678                                             region,
679                                             UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE);
680         }
681         uvm_mutex_unlock(&va_block->lock);
682         uvm_hmm_migrate_finish(va_block);
683     }
684 
685     uvm_va_space_up_write(va_space);
686     uvm_up_read_mmap_lock(mm);
687 }
688 
689 NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
690 {
691     uvm_va_block_test_t *block_test;
692     uvm_va_block_t *va_block;
693     NV_STATUS status;
694 
695     if (!uvm_hmm_is_enabled(va_space))
696         return NV_ERR_INVALID_ADDRESS;
697 
698     status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
699     if (status != NV_OK)
700         return status;
701 
702     block_test = uvm_va_block_get_test(va_block);
703     if (block_test)
704         block_test->inject_split_error = true;
705 
706     return NV_OK;
707 }
708 
709 typedef struct {
710     struct mmu_interval_notifier notifier;
711     uvm_va_block_t *existing_block;
712 } hmm_split_invalidate_data_t;
713 
714 static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
715                                  const struct mmu_notifier_range *range,
716                                  unsigned long cur_seq)
717 {
718     hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);
719 
720     uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
721     hmm_invalidate(split_data->existing_block, range, cur_seq);
722 
723     return true;
724 }
725 
726 static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni,
727                                        const struct mmu_notifier_range *range,
728                                        unsigned long cur_seq)
729 {
730     UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq));
731 }
732 
733 static const struct mmu_interval_notifier_ops hmm_notifier_split_ops =
734 {
735     .invalidate = hmm_split_invalidate_entry,
736 };
737 
738 // Splits existing va_block into two pieces, with new_va_block always after
739 // va_block. va_block is updated to have new_end. new_end+1 must be page-
740 // aligned.
741 //
742 // Before: [----------- existing ------------]
743 // After:  [---- existing ----][---- new ----]
744 //                            ^new_end
745 //
746 // On error, va_block is still accessible and is left in its original
747 // functional state.
748 static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
749                                  NvU64 new_end,
750                                  uvm_va_block_t **new_block_ptr)
751 {
752     uvm_va_space_t *va_space = va_block->hmm.va_space;
753     struct mm_struct *mm = va_space->va_space_mm.mm;
754     hmm_split_invalidate_data_t split_data;
755     NvU64 delay_us;
756     uvm_va_block_t *new_va_block;
757     NV_STATUS status;
758     int ret;
759 
760     uvm_assert_rwsem_locked_write(&va_space->lock);
761 
762     UVM_ASSERT(new_end > va_block->start);
763     UVM_ASSERT(new_end < va_block->end);
764     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
765 
766     status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block);
767     if (status != NV_OK)
768         return status;
769 
770     // Initialize the newly created HMM va_block.
771     hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end);
772 
773     ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
774                                        mm,
775                                        new_va_block->start,
776                                        uvm_va_block_size(new_va_block),
777                                        &uvm_hmm_notifier_ops);
778 
779     // Since __mmu_notifier_register() was called when the va_space was
780     // initially created, we know that mm->notifier_subscriptions is valid
781     // and mmu_interval_notifier_insert() can't return ENOMEM.
    // The only error return is for start + length overflowing, but we already
    // registered the same address range before, so there should be no error.
784     UVM_ASSERT(!ret);
785 
786     uvm_mutex_lock(&va_block->lock);
787 
788     status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
789     if (status != NV_OK)
790         goto err;
791 
792     uvm_mutex_unlock(&va_block->lock);
793 
794     // The MMU interval notifier has to be removed in order to resize it.
795     // That means there would be a window of time when invalidation callbacks
796     // could be missed. To handle this case, we register a temporary notifier
797     // to cover the address range while resizing the old notifier (it is
    // OK to have multiple notifiers for the same range; we may simply try to
799     // invalidate twice).
800     split_data.existing_block = va_block;
801     ret = mmu_interval_notifier_insert(&split_data.notifier,
802                                        mm,
803                                        va_block->start,
804                                        new_end - va_block->start + 1,
805                                        &hmm_notifier_split_ops);
806     UVM_ASSERT(!ret);
807 
    // Delay to allow the hmm_sanity test to trigger an mmu_notifier during the
    // critical window where the split invalidate callback is active.
810     delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
811     if (delay_us)
812         udelay(delay_us);
813 
814     mmu_interval_notifier_remove(&va_block->hmm.notifier);
815 
816     // Enable notifications on the old block with the smaller size.
817     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
818                                        mm,
819                                        va_block->start,
820                                        uvm_va_block_size(va_block),
821                                        &uvm_hmm_notifier_ops);
822     UVM_ASSERT(!ret);
823 
824     mmu_interval_notifier_remove(&split_data.notifier);
825 
826     if (new_block_ptr)
827         *new_block_ptr = new_va_block;
828 
829     return status;
830 
831 err:
832     uvm_mutex_unlock(&va_block->lock);
833     mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
834     uvm_va_block_release(new_va_block);
835     return status;
836 }
837 
838 // Check to see if the HMM va_block would overlap the range start/end and
839 // split it so it can be removed. That breaks down to the following cases:
840 // start/end could cover all of the HMM va_block ->
841 //     remove the va_block
842 // start/end could cover the left part of the HMM va_block ->
843 //     remove the left part
844 // start/end could cover the right part of the HMM va_block ->
845 //     remove the right part
846 // or start/end could "punch a hole" in the middle and leave the ends intact.
847 // In each case, only one HMM va_block is removed so return it in out_va_block.
848 static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
849                                        NvU64 start,
850                                        NvU64 end,
851                                        uvm_va_block_t **out_va_block)
852 {
853     uvm_va_block_context_t *va_block_context;
854     uvm_va_space_t *va_space;
855     struct mm_struct *mm;
856     struct vm_area_struct *vma;
857     uvm_va_block_region_t region;
858     NvU64 addr, from, to;
859     uvm_va_block_t *new;
860     NV_STATUS status;
861 
862     if (va_block->start < start) {
863         status = hmm_split_block(va_block, start - 1, &new);
864         if (status != NV_OK)
865             return status;
866 
867         // Keep the left part, the right part will be deleted.
868         va_block = new;
869     }
870 
871     if (va_block->end > end) {
872         status = hmm_split_block(va_block, end, NULL);
873         if (status != NV_OK)
874             return status;
875 
876         // Keep the right part, the left part will be deleted.
877     }
878 
879     *out_va_block = va_block;
880 
881     // Migrate any GPU data to sysmem before destroying the HMM va_block.
    // We do this because the new va_range might be for a UVM external
    // allocation which could be converting an address range that was first
    // operated on by UVM-HMM, and the external allocation should see that data.
885     va_space = va_block->hmm.va_space;
886     mm = va_space->va_space_mm.mm;
887     va_block_context = uvm_va_space_block_context(va_space, mm);
888 
889     for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
890         vma = find_vma_intersection(mm, addr, va_block->end);
891         if (!vma)
892             break;
893 
894         from = max(addr, (NvU64)vma->vm_start);
895         to = min(va_block->end, (NvU64)vma->vm_end - 1);
896         region = uvm_va_block_region_from_start_end(va_block, from, to);
897 
898         if (!uvm_hmm_vma_is_valid(vma, from, false))
899             continue;
900 
901         va_block_context->hmm.vma = vma;
902 
903         status = hmm_migrate_range(va_block,
904                                    NULL,
905                                    va_block_context,
906                                    UVM_ID_CPU,
907                                    from,
908                                    to,
909                                    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
910                                    NULL);
911         if (status != NV_OK)
912             return status;
913     }
914 
915     return NV_OK;
916 }
917 
918 // Normally, the HMM va_block is destroyed when the va_space is destroyed
919 // (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
920 // a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
921 // page tables. However, it doesn't destroy the va_block because that would
922 // require calling mmu_interval_notifier_remove() which can't be called from
923 // the invalidate callback due to Linux locking constraints. If a process
924 // calls mmap()/munmap() for SAM and then creates a managed allocation,
925 // the same VMA range can be picked and there would be a UVM/HMM va_block
926 // conflict. Creating a managed allocation, external allocation, or other
927 // va_range types, calls this function to remove stale HMM va_blocks or split
928 // the HMM va_block so there is no overlap.
929 NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
930                                    struct mm_struct *mm,
931                                    NvU64 start,
932                                    NvU64 end)
933 {
934     uvm_range_tree_node_t *node, *next;
935     uvm_va_block_t *va_block;
936     NV_STATUS status;
937 
938     if (!uvm_hmm_is_enabled(va_space))
939         return NV_OK;
940 
941     if (mm)
942         uvm_assert_mmap_lock_locked(mm);
943     uvm_assert_rwsem_locked_write(&va_space->lock);
944 
945     // Process each HMM va_block that overlaps the interval [start, end].
946     // Note that end is inclusive.
947     // The blocks_lock is not needed when the va_space lock is held for write.
948     uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
949         va_block = hmm_va_block_from_node(node);
950 
951         if (mm) {
952             status = split_block_if_needed(va_block, start, end, &va_block);
953             if (status != NV_OK)
954                 return status;
955         }
956 
        // Note that this waits for any invalidation callbacks to complete
        // so uvm_hmm_invalidate() won't see a block disappear.
959         // The va_space write lock should prevent uvm_hmm_va_block_find_create()
960         // from adding it back.
961         mmu_interval_notifier_remove(&va_block->hmm.notifier);
962         uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
963         uvm_va_block_kill(va_block);
964     }
965 
966     UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));
967 
968     return NV_OK;
969 }
970 
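// Update the va_space's HMM block tree after an existing HMM va_block has been
// split, so that both the existing and the new block are tracked.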
971 void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
972 {
973     uvm_va_space_t *va_space = existing_va_block->hmm.va_space;
974 
975     UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
976     uvm_assert_rwsem_locked_write(&va_space->lock);
977 
978     uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
979                          &existing_va_block->hmm.node,
980                          &new_block->hmm.node);
981 }
982 
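// If an HMM va_block and policy node span addr, split the policy node at addr
// when the callback reports a split is needed, so that addr becomes a policy
// boundary.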
983 NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
984                                   NvU64 addr,
985                                   uvm_va_policy_is_split_needed_t split_needed_cb,
986                                   void *data)
987 {
988     uvm_va_block_t *va_block;
989     uvm_va_policy_node_t *node;
990     NV_STATUS status;
991 
992     uvm_assert_rwsem_locked_write(&va_space->lock);
993 
994     // If there is no HMM va_block or the va_block doesn't span the policy
995     // addr, there is no need to split.
996     status = uvm_hmm_va_block_find(va_space, addr, &va_block);
997     if (status != NV_OK || va_block->start == addr)
998         return NV_OK;
999 
1000     uvm_mutex_lock(&va_block->lock);
1001 
1002     node = uvm_va_policy_node_find(va_block, addr);
1003     if (!node)
1004         goto done;
1005 
1006     // If the policy range doesn't span addr, we're done.
1007     if (addr == node->node.start)
1008         goto done;
1009 
1010     if (split_needed_cb(&node->policy, data))
1011         status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL);
1012 
1013 done:
1014     uvm_mutex_unlock(&va_block->lock);
1015     return status;
1016 }
1017 
1018 static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block,
1019                                                    uvm_va_block_context_t *va_block_context,
1020                                                    uvm_processor_id_t preferred_location,
1021                                                    NvU64 addr,
1022                                                    NvU64 end,
1023                                                    uvm_tracker_t *out_tracker)
1024 {
1025     uvm_processor_mask_t set_accessed_by_processors;
1026     const uvm_va_policy_t *old_policy;
1027     uvm_va_policy_node_t *node;
1028     uvm_va_block_region_t region;
1029     uvm_processor_id_t id;
1030     NV_STATUS status, tracker_status;
1031 
1032     // Note that we can't just call uvm_va_policy_set_range() for the whole
    // range [addr, end] because we need to examine the old value of
1034     // policy->preferred_location before setting it. Thus we iterate over
1035     // the existing policy nodes.
1036     uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) {
1037         if (uvm_id_equal(old_policy->preferred_location, preferred_location))
1038             continue;
1039 
1040         // If the old preferred location is a valid processor ID, remote
1041         // mappings should be established to the new preferred location if
1042         // accessed-by is set.
1043         uvm_processor_mask_zero(&set_accessed_by_processors);
1044 
1045         if (UVM_ID_IS_VALID(old_policy->preferred_location) &&
1046             uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location))
1047             uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location);
1048 
1049         va_block_context->policy = uvm_va_policy_set_preferred_location(va_block,
1050                                                                         region,
1051                                                                         preferred_location,
1052                                                                         old_policy);
1053         if (!va_block_context->policy)
1054             return NV_ERR_NO_MEMORY;
1055 
1056         // Establish new remote mappings if the old preferred location had
1057         // accessed-by set.
1058         for_each_id_in_mask(id, &set_accessed_by_processors) {
1059             status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker);
1060             if (status != NV_OK)
1061                 return status;
1062         }
1063 
        // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock
1065         // the va_block lock, the policy remains valid because we hold the mmap
1066         // lock so munmap can't remove the policy, and the va_space lock so the
1067         // policy APIs can't change the policy.
1068         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1069                                            NULL,
1070                                            uvm_va_block_set_preferred_location_locked(va_block,
1071                                                                                       va_block_context,
1072                                                                                       region));
1073 
1074         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
1075         if (status == NV_OK)
1076             status = tracker_status;
1077 
1078         if (status != NV_OK)
1079             return status;
1080     }
1081 
1082     return NV_OK;
1083 }
1084 
1085 NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
1086                                          uvm_processor_id_t preferred_location,
1087                                          NvU64 base,
1088                                          NvU64 last_address,
1089                                          uvm_tracker_t *out_tracker)
1090 {
1091     uvm_va_block_context_t *va_block_context;
1092     uvm_va_block_t *va_block;
1093     NvU64 addr;
1094     NV_STATUS status = NV_OK;
1095 
1096     if (!uvm_hmm_is_enabled(va_space))
1097         return NV_ERR_INVALID_ADDRESS;
1098 
1099     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1100     uvm_assert_rwsem_locked_write(&va_space->lock);
1101     UVM_ASSERT(PAGE_ALIGNED(base));
1102     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1103     UVM_ASSERT(base < last_address);
1104 
1105     // Update HMM preferred location policy.
1106 
1107     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1108 
1109     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1110         NvU64 end;
1111 
1112         status = hmm_va_block_find_create(va_space, addr, true, va_block_context, &va_block);
1113         if (status != NV_OK)
1114             break;
1115 
1116         end = min(last_address, va_block->end);
1117 
1118         uvm_mutex_lock(&va_block->lock);
1119 
1120         status = hmm_set_preferred_location_locked(va_block,
1121                                                    va_block_context,
1122                                                    preferred_location,
1123                                                    addr,
1124                                                    end,
1125                                                    out_tracker);
1126 
1127         uvm_mutex_unlock(&va_block->lock);
1128 
1129         if (status != NV_OK)
1130             break;
1131     }
1132 
1133     return status;
1134 }
1135 
1136 static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block,
1137                                                       uvm_va_block_context_t *va_block_context,
1138                                                       uvm_processor_id_t processor_id,
1139                                                       NvU64 start,
1140                                                       NvU64 end,
1141                                                       uvm_tracker_t *out_tracker)
1142 {
1143     uvm_va_space_t *va_space = va_block->hmm.va_space;
1144     uvm_va_policy_node_t *node;
1145     uvm_va_block_region_t region;
1146     NV_STATUS status = NV_OK;
1147 
1148     uvm_for_each_va_policy_node_in(node, va_block, start, end) {
1149         // Read duplication takes precedence over SetAccessedBy.
1150         // Do not add mappings if read duplication is enabled.
1151         if (uvm_va_policy_is_read_duplicate(&node->policy, va_space))
1152             continue;
1153 
1154         va_block_context->policy = &node->policy;
1155         region = uvm_va_block_region_from_start_end(va_block,
1156                                                     max(start, node->node.start),
1157                                                     min(end, node->node.end));
1158 
1159         status = uvm_va_block_set_accessed_by_locked(va_block,
1160                                                      va_block_context,
1161                                                      processor_id,
1162                                                      region,
1163                                                      out_tracker);
1164         if (status != NV_OK)
1165             break;
1166     }
1167 
1168     return status;
1169 }
1170 
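// Set or clear the accessed-by policy for processor_id over
// [base, last_address]. When setting, also establish the corresponding
// mappings.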
1171 NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space,
1172                                   uvm_processor_id_t processor_id,
1173                                   bool set_bit,
1174                                   NvU64 base,
1175                                   NvU64 last_address,
1176                                   uvm_tracker_t *out_tracker)
1177 {
1178     uvm_va_block_context_t *va_block_context;
1179     uvm_va_block_t *va_block;
1180     NvU64 addr;
1181     NV_STATUS status = NV_OK;
1182 
1183     if (!uvm_hmm_is_enabled(va_space))
1184         return NV_ERR_INVALID_ADDRESS;
1185 
1186     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1187     uvm_assert_rwsem_locked_write(&va_space->lock);
1188     UVM_ASSERT(PAGE_ALIGNED(base));
1189     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1190     UVM_ASSERT(base < last_address);
1191 
1192     // Update HMM accessed by policy.
1193 
1194     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1195 
1196     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1197         NvU64 end;
1198 
1199         status = hmm_va_block_find_create(va_space, addr, true, va_block_context, &va_block);
1200         if (status != NV_OK)
1201             break;
1202 
1203         end = min(last_address, va_block->end);
1204 
1205         uvm_mutex_lock(&va_block->lock);
1206 
1207         status = uvm_va_policy_set_range(va_block,
1208                                          addr,
1209                                          end,
1210                                          UVM_VA_POLICY_ACCESSED_BY,
1211                                          !set_bit,
1212                                          processor_id,
1213                                          UVM_READ_DUPLICATION_MAX);
1214 
1215         if (status == NV_OK && set_bit) {
1216             status = hmm_set_accessed_by_start_end_locked(va_block,
1217                                                           va_block_context,
1218                                                           processor_id,
1219                                                           addr,
1220                                                           end,
1221                                                           out_tracker);
1222         }
1223 
1224         uvm_mutex_unlock(&va_block->lock);
1225 
1226         if (status != NV_OK)
1227             break;
1228     }
1229 
1230     return status;
1231 }
1232 
1233 void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
1234                                          uvm_va_block_t *va_block,
1235                                          uvm_va_block_context_t *block_context)
1236 {
1237     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1238     uvm_va_policy_node_t *node;
1239     uvm_va_block_region_t region;
1240     uvm_processor_mask_t map_processors;
1241     uvm_processor_id_t id;
1242     NV_STATUS tracker_status;
1243     NV_STATUS status = NV_OK;
1244 
1245     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1246     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1247     uvm_assert_rwsem_locked(&va_space->lock);
1248 
1249     uvm_mutex_lock(&va_block->lock);
1250 
1251     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
        block_context->policy = &node->policy;

        // Map within the range covered by this policy node.
        region = uvm_va_block_region_from_start_end(va_block,
                                                    node->node.start,
                                                    node->node.end);

1254         for_each_id_in_mask(id, &node->policy.accessed_by) {
1255             status = hmm_set_accessed_by_start_end_locked(va_block,
1256                                                           block_context,
1257                                                           id,
1258                                                           node->node.start,
1259                                                           node->node.end,
1260                                                           &local_tracker);
1261             if (status != NV_OK)
1262                 break;
1263 
1264             if (!uvm_va_space_map_remote_on_eviction(va_space))
1265                 continue;
1266 
1267             // Exclude the processors that have been already mapped due to
1268             // AccessedBy.
1269             uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
1270 
1271             for_each_gpu_id_in_mask(id, &map_processors) {
1272                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
1273                 uvm_va_block_gpu_state_t *gpu_state;
1274 
1275                 if (!gpu->parent->access_counters_supported)
1276                     continue;
1277 
1278                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1279                 UVM_ASSERT(gpu_state);
1280 
1281                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
1282                 // remote mappings to read-duplicated pages. Add support for it
1283                 // or create a new function.
1284                 status = uvm_va_block_add_mappings(va_block,
1285                                                    block_context,
1286                                                    id,
1287                                                    region,
1288                                                    &gpu_state->evicted,
1289                                                    UvmEventMapRemoteCauseEviction);
1290                 tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker);
1291                 status = (status == NV_OK) ? tracker_status : status;
1292                 if (status != NV_OK) {
1293                     UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
1294                     break;
1295                 }
1296             }
1297         }
1298     }
1299 
1300     uvm_mutex_unlock(&va_block->lock);
1301 
1302     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1303     status = (status == NV_OK) ? tracker_status : status;
1304     if (status != NV_OK) {
1305         UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n",
1306                       va_block->start,
1307                       va_block->end,
1308                       nvstatusToString(status));
1309     }
1310 }
1311 
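// Clamp *endp to the containing VMA and to the policy node covering addr, and
// point va_block_context->policy at that node's policy (or the default policy
// if there is no node).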
1312 void uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
1313                              uvm_va_block_context_t *va_block_context,
1314                              unsigned long addr,
1315                              NvU64 *endp)
1316 {
1317     struct vm_area_struct *vma = va_block_context->hmm.vma;
1318     const uvm_va_policy_node_t *node;
1319     NvU64 end = va_block->end;
1320 
1321     uvm_assert_mmap_lock_locked(vma->vm_mm);
1322     uvm_assert_mutex_locked(&va_block->lock);
1323 
1324     if (end > vma->vm_end - 1)
1325         end = vma->vm_end - 1;
1326 
1327     node = uvm_va_policy_node_find(va_block, addr);
1328     if (node) {
1329         va_block_context->policy = &node->policy;
1330         if (end > node->node.end)
1331             end = node->node.end;
1332     }
1333     else {
1334         va_block_context->policy = &uvm_va_policy_default;
1335     }
1336 
1337     *endp = end;
1338 }
1339 
1340 NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
1341                                             uvm_va_block_context_t *va_block_context,
1342                                             uvm_page_index_t page_index,
1343                                             uvm_page_index_t *outerp)
1344 {
1345     struct vm_area_struct *vma;
1346     unsigned long addr;
1347     NvU64 end;
1348     uvm_page_index_t outer;
1349 
1350     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1351     uvm_assert_mmap_lock_locked(va_block_context->mm);
1352     uvm_assert_mutex_locked(&va_block->lock);
1353 
1354     addr = uvm_va_block_cpu_page_address(va_block, page_index);
1355 
1356     vma = vma_lookup(va_block_context->mm, addr);
1357     if (!vma || !(vma->vm_flags & VM_READ))
1358         return NV_ERR_INVALID_ADDRESS;
1359 
1360     va_block_context->hmm.vma = vma;
1361 
1362     uvm_hmm_find_policy_end(va_block, va_block_context, addr, &end);
1363 
1364     outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
1365     if (*outerp > outer)
1366         *outerp = outer;
1367 
1368     return NV_OK;
1369 }
1370 
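// Unmap remote pinned pages for each policy range and destroy the va_block's
// thrashing state.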
1371 static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
1372                                             uvm_va_block_context_t *block_context)
1373 {
1374     const uvm_va_policy_t *policy;
1375     uvm_va_policy_node_t *node;
1376     uvm_va_block_region_t region;
1377     NV_STATUS status = NV_OK;
1378 
1379     uvm_mutex_lock(&va_block->lock);
1380 
1381     uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) {
1382         block_context->policy = policy;
1383 
1384         // Unmap may split PTEs and require a retry. Needs to be called
1385         // before the pinned pages information is destroyed.
1386         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1387                                            NULL,
1388                                            uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
1389                                                                                             block_context,
1390                                                                                             region));
1391 
1392         uvm_perf_thrashing_info_destroy(va_block);
1393 
1394         if (status != NV_OK)
1395             break;
1396     }
1397 
1398     uvm_mutex_unlock(&va_block->lock);
1399 
1400     return status;
1401 }
1402 
1403 NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
1404 {
1405     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
1406     uvm_range_tree_node_t *node, *next;
1407     uvm_va_block_t *va_block;
1408     NV_STATUS status = NV_OK;
1409 
1410     if (!uvm_hmm_is_enabled(va_space))
1411         return NV_OK;
1412 
1413     uvm_assert_rwsem_locked_write(&va_space->lock);
1414 
1415     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
1416         va_block = hmm_va_block_from_node(node);
1417 
1418         status = hmm_clear_thrashing_policy(va_block, block_context);
1419         if (status != NV_OK)
1420             break;
1421     }
1422 
1423     return status;
1424 }
1425 
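// Compute the largest region around 'address' that prefetching may operate
// on: the intersection of the va_block, the current VMA, and the applicable
// policy range (or the policy tree hole when the default policy applies).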
1426 uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
1427                                                   uvm_va_block_context_t *va_block_context,
1428                                                   NvU64 address)
1429 {
1430     struct vm_area_struct *vma = va_block_context->hmm.vma;
1431     const uvm_va_policy_t *policy = va_block_context->policy;
1432     NvU64 start, end;
1433 
1434     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1435 
1436     // We need to limit the prefetch region to the VMA.
1437     start = max(va_block->start, (NvU64)vma->vm_start);
1438     end = min(va_block->end, (NvU64)vma->vm_end - 1);
1439 
1440     // Also, we need to limit the prefetch region to the policy range.
1441     if (uvm_va_policy_is_default(policy)) {
1442         NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
1443                                                        address,
1444                                                        &start,
1445                                                        &end);
1446         // We already know the hole exists and covers the fault region.
1447         UVM_ASSERT(status == NV_OK);
1448     }
1449     else {
1450         const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);
1451 
1452         start = max(start, node->node.start);
1453         end = min(end, node->node.end);
1454     }
1455 
1456     return uvm_va_block_region_from_start_end(va_block, start, end);
1457 }
1458 
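// Derive the maximum logical protection for 'addr' from the VMA flags:
// UVM_PROT_NONE without VM_READ, UVM_PROT_READ_ONLY without VM_WRITE, and
// UVM_PROT_READ_WRITE_ATOMIC otherwise.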
1459 uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
1460                                         uvm_va_block_context_t *va_block_context,
1461                                         NvU64 addr)
1462 {
1463     struct vm_area_struct *vma = va_block_context->hmm.vma;
1464 
1465     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1466     uvm_assert_mmap_lock_locked(va_block_context->mm);
1467     UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);
1468 
1469     if (!(vma->vm_flags & VM_READ))
1470         return UVM_PROT_NONE;
1471     else if (!(vma->vm_flags & VM_WRITE))
1472         return UVM_PROT_READ_ONLY;
1473     else
1474         return UVM_PROT_READ_WRITE_ATOMIC;
1475 }
1476 
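// Wrap 'page' in an HMM CPU chunk, insert it into the va_block at
// 'page_index', and map it for DMA access on the GPUs. The shared zero page
// is rejected with NV_ERR_INVALID_ADDRESS.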
1477 static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
1478                                                 uvm_page_index_t page_index,
1479                                                 struct page *page)
1480 {
1481     uvm_cpu_chunk_t *chunk;
1482     NV_STATUS status;
1483 
1484     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1485     UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index));
1486 
1487     if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)))
1488         return NV_ERR_INVALID_ADDRESS;
1489 
1490     status = uvm_cpu_chunk_alloc_hmm(page, &chunk);
1491     if (status != NV_OK)
1492         return status;
1493 
1494     status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
1495     if (status != NV_OK) {
1496         uvm_cpu_chunk_free(chunk);
1497         return status;
1498     }
1499 
1500     status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, page_index);
1501     if (status != NV_OK) {
1502         uvm_cpu_chunk_remove_from_block(va_block, page_index);
1503         uvm_cpu_chunk_free(chunk);
1504     }
1505 
1506     return status;
1507 }
1508 
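// Undo hmm_va_block_cpu_page_populate(): remove the CPU chunk at 'page_index'
// from the va_block, unmap it from the GPUs, and free it. The page must no
// longer be marked CPU resident.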
1509 static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block,
1510                                              uvm_page_index_t page_index)
1511 {
1512     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
1513 
1514     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1515 
1516     if (!chunk)
1517         return;
1518 
1519     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1520                !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1521 
1522     uvm_cpu_chunk_remove_from_block(va_block, page_index);
1523     uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1524     uvm_cpu_chunk_free(chunk);
1525 }
1526 
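// Return true if the HMM CPU chunk recorded at 'page_index' wraps exactly the
// given struct page.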
1527 static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
1528                                           uvm_page_index_t page_index,
1529                                           struct page *page)
1530 {
1531     struct page *old_page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
1532 
1533     UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_index)));
1534     return old_page == page;
1535 }
1536 
1537 // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the
1538 // service_context masks to match what is being processed. Since a page
1539 // that was expected to be processed isn't migrating, we have to clear the
1540 // masks to make service_context consistent with what is actually being
1541 // handled.
1542 static void clear_service_context_masks(uvm_service_block_context_t *service_context,
1543                                         uvm_processor_id_t new_residency,
1544                                         uvm_page_index_t page_index)
1545 {
1546     uvm_page_mask_clear(&service_context->block_context.caller_page_mask, page_index);
1547 
1548     uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
1549                         page_index);
1550 
1551     if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency))
1552         uvm_processor_mask_clear(&service_context->resident_processors, new_residency);
1553 
1554     if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency))
1555         uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index);
1556 
1557     if (service_context->thrashing_pin_count > 0 &&
1558         uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) {
1559         service_context->thrashing_pin_count--;
1560     }
1561 
1562     if (service_context->read_duplicate_count > 0 &&
1563         uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) {
1564         service_context->read_duplicate_count--;
1565     }
1566 }
1567 
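// Record in the va_block state that the CPU maps the page at 'page_index'
// with read (and, if is_write, write) permission.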
1568 static void cpu_mapping_set(uvm_va_block_t *va_block,
1569                             bool is_write,
1570                             uvm_page_index_t page_index)
1571 {
1572     uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
1573     uvm_page_mask_set(&va_block->maybe_mapped_pages, page_index);
1574     uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1575     if (is_write)
1576         uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1577     else
1578         uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1579 }
1580 
1581 static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1582 {
1583     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1584     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1585     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1586         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1587 }
1588 
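// Drop the va_block's record of the GPU chunk backing the device private
// 'page': unmap the chunk's virtual mapping and clear its chunks[] entry.
// Nothing to do if no chunk is recorded (e.g., the page was unmapped by
// mremap and never faulted back in).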
1589 static void gpu_chunk_remove(uvm_va_block_t *va_block,
1590                              uvm_page_index_t page_index,
1591                              struct page *page)
1592 {
1593     uvm_va_block_gpu_state_t *gpu_state;
1594     uvm_gpu_chunk_t *gpu_chunk;
1595     uvm_gpu_id_t id;
1596 
1597     id = uvm_pmm_devmem_page_to_gpu_id(page);
1598     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1599     UVM_ASSERT(gpu_state);
1600 
1601     gpu_chunk = gpu_state->chunks[page_index];
1602     if (!gpu_chunk) {
1603         // If we didn't find a chunk, it's because the page was unmapped for
1604         // mremap and no fault has established a new mapping.
1605         UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
1606         return;
1607     }
1608 
1609     // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
1610 
1611     uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
1612     gpu_state->chunks[page_index] = NULL;
1613 }
1614 
1615 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
1616                                uvm_page_index_t page_index,
1617                                struct page *page)
1618 {
1619     uvm_va_block_gpu_state_t *gpu_state;
1620     uvm_gpu_chunk_t *gpu_chunk;
1621     uvm_gpu_id_t id;
1622     NV_STATUS status;
1623 
1624     id = uvm_pmm_devmem_page_to_gpu_id(page);
1625     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1626 
1627     // It's possible that this is a fresh va_block we're trying to add an
1628     // existing gpu_chunk to. This occurs for example when a GPU faults on a
1629     // virtual address that has been remapped with mremap().
1630     if (!gpu_state) {
1631         status = uvm_va_block_gpu_state_alloc(va_block);
1632         if (status != NV_OK)
1633             return status;
1634         gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1635     }
1636 
1637     UVM_ASSERT(gpu_state);
1638 
1639     // Note that a mremap() might be to a CPU virtual address that is no longer
1640     // aligned with a larger GPU chunk size. We would need to allocate a new
1641     // aligned GPU chunk and copy from old to new.
1642     // TODO: Bug 3368756: add support for large GPU pages.
1643     gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
1644     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1645     UVM_ASSERT(gpu_chunk->is_referenced);
1646     UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
1647 
1648     if (gpu_state->chunks[page_index] == gpu_chunk)
1649         return NV_OK;
1650 
1651     UVM_ASSERT(!gpu_state->chunks[page_index]);
1652 
1653     // In some configurations such as SR-IOV heavy, the chunk cannot be
1654     // referenced using its physical address. Create a virtual mapping.
1655     status = uvm_mmu_chunk_map(gpu_chunk);
1656     if (status != NV_OK)
1657         return status;
1658 
1659     // TODO: Bug 3898467: map indirect peers.
1660 
1661     uvm_processor_mask_set(&va_block->resident, id);
1662     uvm_page_mask_set(&gpu_state->resident, page_index);
1663 
1664     // It is safe to modify the page index field without holding any PMM locks
1665     // because the chunk is allocated, which means that none of the other
1666     // fields in the bitmap can change.
1667     gpu_chunk->va_block = va_block;
1668     gpu_chunk->va_block_page_index = page_index;
1669 
1670     gpu_state->chunks[page_index] = gpu_chunk;
1671 
1672     return NV_OK;
1673 }
1674 
1675 // This is called just before calling migrate_vma_finalize() in order to wait
1676 // for GPU operations to complete and update the va_block state to match which
1677 // pages migrated (or not) and therefore which pages will be released by
1678 // migrate_vma_finalize().
1679 // 'migrated_pages' is the mask of pages that migrated,
1680 // 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
1681 // and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
1682 // 'region' is the page index region of all migrated, non-migrated, and
1683 // same_devmem_page_mask pages.
1684 static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
1685                                            const unsigned long *src_pfns,
1686                                            const unsigned long *dst_pfns,
1687                                            uvm_va_block_region_t region,
1688                                            const uvm_page_mask_t *migrated_pages,
1689                                            const uvm_page_mask_t *same_devmem_page_mask)
1690 {
1691     uvm_page_index_t page_index;
1692     NV_STATUS status;
1693 
1694     // Wait for the GPU to finish. migrate_vma_finalize() will release the
1695     // migrated source pages (or non-migrating destination pages), so GPU
1696     // operations must be finished by then.
1697     status = uvm_tracker_wait(&va_block->tracker);
1698 
1699     for_each_va_block_page_in_region(page_index, region) {
1700         struct page *page;
1701 
1702         if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1703             continue;
1704 
1705         // If a page migrated, clean up the source page.
1706         // Otherwise, clean up the destination page.
1707         if (uvm_page_mask_test(migrated_pages, page_index))
1708             page = migrate_pfn_to_page(src_pfns[page_index]);
1709         else
1710             page = migrate_pfn_to_page(dst_pfns[page_index]);
1711 
1712         if (!page)
1713             continue;
1714 
1715         if (is_device_private_page(page)) {
1716             gpu_chunk_remove(va_block, page_index, page);
1717         }
1718         else {
1719             // If the source page is a system memory page,
1720             // migrate_vma_finalize() will release the reference so we should
1721             // clear our pointer to it.
1722             // TODO: Bug 3660922: Need to handle read duplication at some point.
1723             hmm_va_block_cpu_page_unpopulate(va_block, page_index);
1724         }
1725     }
1726 
1727     return status;
1728 }
1729 
1730 // Update va_block state to reflect that the page isn't migrating.
1731 static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
1732                                         const unsigned long *src_pfns,
1733                                         unsigned long *dst_pfns,
1734                                         uvm_page_index_t page_index)
1735 {
1736     struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]);
1737 
1738     if (!dst_page)
1739         return;
1740 
1741     // migrate_vma_finalize() will release the dst_page reference so don't keep
1742     // a pointer to it.
1743     if (is_device_private_page(dst_page)) {
1744         gpu_chunk_remove(va_block, page_index, dst_page);
1745     }
1746     else {
1747         UVM_ASSERT(page_ref_count(dst_page) == 1);
1748 
1749         hmm_va_block_cpu_page_unpopulate(va_block, page_index);
1750     }
1751 
1752     unlock_page(dst_page);
1753     put_page(dst_page);
1754     dst_pfns[page_index] = 0;
1755 }
1756 
1757 static void clean_up_non_migrating_pages(uvm_va_block_t *va_block,
1758                                          const unsigned long *src_pfns,
1759                                          unsigned long *dst_pfns,
1760                                          uvm_va_block_region_t region,
1761                                          uvm_page_mask_t *page_mask)
1762 {
1763     uvm_page_index_t page_index;
1764     NV_STATUS status;
1765 
1766     status = uvm_tracker_wait(&va_block->tracker);
1767     UVM_ASSERT(status == NV_OK);
1768 
1769     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1770         clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index);
1771     }
1772 }
1773 
1774 // CPU page fault handling.
1775 
1776 // Fill in the dst_pfns[page_index] entry given that there is an allocated
1777 // CPU page.
1778 static void lock_block_cpu_page(uvm_va_block_t *va_block,
1779                                 uvm_page_index_t page_index,
1780                                 struct page *src_page,
1781                                 unsigned long *dst_pfns,
1782                                 uvm_page_mask_t *same_devmem_page_mask)
1783 {
1784     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
1785     uvm_va_block_region_t chunk_region;
1786     struct page *dst_page;
1787 
1788     UVM_ASSERT(chunk);
1789     UVM_ASSERT(chunk->page);
1790 
1791     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
1792 
1793     dst_page = chunk->page + (page_index - chunk_region.first);
1794 
1795     UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)));
1796     UVM_ASSERT(!is_device_private_page(dst_page));
1797 
1798     // The source page is usually a device private page but it could be a GPU
1799     // remote mapped system memory page. It could also be a driver allocated
1800     // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned
1801     // by the driver).
1802     if (is_device_private_page(src_page)) {
1803         // Since the page isn't mirrored, it was allocated by alloc_pages()
1804         // and UVM owns the reference. We leave the reference count unchanged
1805         // and mark the page pointer as mirrored since UVM is transferring
1806         // ownership to Linux and we don't want UVM to double free the page in
1807         // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
1808         // does not migrate, however, it will be freed.
1809         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1810                    !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1811         UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
1812         UVM_ASSERT(page_ref_count(dst_page) == 1);
1813         uvm_cpu_chunk_make_hmm(chunk);
1814     }
1815     else {
1816         UVM_ASSERT(same_devmem_page_mask);
1817         UVM_ASSERT(src_page == dst_page);
1818         uvm_page_mask_set(same_devmem_page_mask, page_index);
1819 
1820         // The call to migrate_vma_setup() will have inserted a migration PTE
1821         // so the CPU has no access.
1822         cpu_mapping_clear(va_block, page_index);
1823         return;
1824     }
1825 
1826     lock_page(dst_page);
1827     dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
1828 }
1829 
1830 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block,
1831                                           uvm_gpu_t *gpu,
1832                                           uvm_gpu_chunk_t *gpu_chunk)
1833 {
1834     // Tell PMM to expect a callback from Linux to free the page since the
1835     // device private struct page reference count will determine when the
1836     // GPU chunk is free.
1837     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1838     list_del_init(&gpu_chunk->list);
1839     uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block);
1840 }
1841 
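// Point dst_pfns[page_index] at the device private page backing the GPU chunk
// allocated as the migration destination, handing the chunk's lifetime over
// to the struct page reference count. If the source is already the same GPU
// page, record it in same_devmem_page_mask instead of migrating it.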
1842 static void fill_dst_pfn(uvm_va_block_t *va_block,
1843                          uvm_gpu_t *gpu,
1844                          const unsigned long *src_pfns,
1845                          unsigned long *dst_pfns,
1846                          uvm_page_index_t page_index,
1847                          uvm_page_mask_t *same_devmem_page_mask)
1848 {
1849     unsigned long src_pfn = src_pfns[page_index];
1850     uvm_gpu_chunk_t *gpu_chunk;
1851     unsigned long pfn;
1852     struct page *dpage;
1853 
1854     gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index));
1855     UVM_ASSERT(gpu_chunk);
1856     UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT);
1857     pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
1858 
1859     // If the same GPU page is both source and destination, migrate_vma_pages()
1860     // will see the wrong "expected" reference count and not migrate it, so we
1861     // mark it as not migrating, but track it so we don't confuse it with a
1862     // page that migrate_vma_pages() actually does not migrate.
1863     if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) {
1864         uvm_page_mask_set(same_devmem_page_mask, page_index);
1865         return;
1866     }
1867 
1868     dpage = pfn_to_page(pfn);
1869     UVM_ASSERT(is_device_private_page(dpage));
1870     UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global);
1871 
1872     hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
1873     UVM_ASSERT(!page_count(dpage));
1874     zone_device_page_init(dpage);
1875     dpage->zone_device_data = va_block->hmm.va_space;
1876 
1877     dst_pfns[page_index] = migrate_pfn(pfn);
1878 }
1879 
1880 static void fill_dst_pfns(uvm_va_block_t *va_block,
1881                           const unsigned long *src_pfns,
1882                           unsigned long *dst_pfns,
1883                           uvm_va_block_region_t region,
1884                           uvm_page_mask_t *page_mask,
1885                           uvm_page_mask_t *same_devmem_page_mask,
1886                           uvm_processor_id_t dest_id)
1887 {
1888     uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
1889     uvm_page_index_t page_index;
1890 
1891     uvm_page_mask_zero(same_devmem_page_mask);
1892 
1893     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1894         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE))
1895             continue;
1896 
1897         fill_dst_pfn(va_block,
1898                      gpu,
1899                      src_pfns,
1900                      dst_pfns,
1901                      page_index,
1902                      same_devmem_page_mask);
1903     }
1904 }
1905 
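// Prepare dst_pfns[] for a migration to system memory: reuse pages from
// already-allocated CPU chunks, allocate new anonymous pages where needed,
// and clear page_mask (and the service_context masks) for pages that can't or
// don't need to migrate. Returns NV_WARN_MORE_PROCESSING_REQUIRED if nothing
// is left to migrate.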
1906 static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
1907                                        struct vm_area_struct *vma,
1908                                        const unsigned long *src_pfns,
1909                                        unsigned long *dst_pfns,
1910                                        uvm_va_block_region_t region,
1911                                        uvm_page_mask_t *page_mask,
1912                                        uvm_page_mask_t *same_devmem_page_mask,
1913                                        uvm_processor_id_t processor_id,
1914                                        uvm_service_block_context_t *service_context)
1915 {
1916     uvm_page_index_t page_index;
1917     NV_STATUS status = NV_OK;
1918 
1919     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1920         struct page *src_page;
1921         struct page *dst_page;
1922         gfp_t gfp;
1923 
1924         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
1925             // Device exclusive PTEs are not selected but we still want to
1926             // process the page so record it as such.
1927             if (service_context && !UVM_ID_IS_CPU(processor_id) &&
1928                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
1929                 uvm_page_mask_set(same_devmem_page_mask, page_index);
1930                 continue;
1931             }
1932 
1933             // We have previously found a page that is CPU resident which can't
1934             // be migrated (probably a shared mapping) so make sure we establish
1935             // a remote mapping for it.
1936             if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1937                 continue;
1938 
1939             goto clr_mask;
1940         }
1941 
1942         // This is the page that will be copied to system memory.
1943         src_page = migrate_pfn_to_page(src_pfns[page_index]);
1944 
1945         if (src_page) {
1946             // mremap may have caused us to lose the gpu_chunk associated with
1947             // this va_block/page_index so make sure we have the correct chunk.
1948             if (is_device_private_page(src_page))
1949                 gpu_chunk_add(va_block, page_index, src_page);
1950 
1951             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
1952                 lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
1953                 continue;
1954             }
1955         }
1956 
1957         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1958                    !uvm_page_mask_test(&va_block->cpu.resident, page_index));
1959 
1960         // Allocate a user system memory page for the destination.
1961         // This is the typical case since Linux will free the source page when
1962         // migrating to device private memory.
1963         // If there is no source page, it means the page is pte_none() or the
1964         // zero page. This case "shouldn't happen" because we asked
1965         // migrate_vma_setup() only for device private pages but
1966         // migrate_vma_collect_hole() doesn't check the
1967         // MIGRATE_VMA_SELECT_SYSTEM flag.
1968         gfp = GFP_HIGHUSER_MOVABLE;
1969         if (!src_page)
1970             gfp |= __GFP_ZERO;
1971 
1972         dst_page = alloc_page_vma(gfp,
1973                                   vma,
1974                                   va_block->start + (page_index << PAGE_SHIFT));
1975         if (!dst_page) {
1976             // Ignore errors if the page is only for prefetching.
1977             if (service_context &&
1978                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1979                 goto clr_mask;
1980             UVM_ERR_PRINT("cannot allocate page %u (addr 0x%llx)\n",
1981                           page_index, va_block->start + (page_index << PAGE_SHIFT));
1982             status = NV_ERR_NO_MEMORY;
1983             break;
1984         }
1985 
1986         status = hmm_va_block_cpu_page_populate(va_block, page_index, dst_page);
1987         if (status != NV_OK) {
1988             __free_page(dst_page);
1989             // Ignore errors if the page is only for prefetching.
1990             if (service_context &&
1991                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
1992                 goto clr_mask;
1993             break;
1994         }
1995 
1996         // Note that we don't call get_page(dst_page) since alloc_page_vma()
1997         // returns with a page reference count of one and we are passing
1998         // ownership to Linux. Also, hmm_va_block_cpu_page_populate() recorded
1999         // the page as "mirrored" so that migrate_vma_finalize() and
2000         // hmm_va_block_cpu_page_unpopulate() don't double free the page.
2001         lock_page(dst_page);
2002         dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
2003         continue;
2004 
2005     clr_mask:
2006         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2007         uvm_page_mask_clear(page_mask, page_index);
2008         if (service_context)
2009             clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
2010     }
2011 
2012     if (status != NV_OK)
2013         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2014     else if (uvm_page_mask_empty(page_mask))
2015         return NV_WARN_MORE_PROCESSING_REQUIRED;
2016 
2017     return status;
2018 }
2019 
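// First half of servicing a fault that migrates device private pages back to
// system memory: build the migration page mask, allocate and lock the
// destination CPU pages, and start the copy without yet updating residency or
// mappings.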
2020 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2021 {
2022     uvm_processor_id_t processor_id;
2023     uvm_service_block_context_t *service_context;
2024     uvm_va_block_retry_t *va_block_retry;
2025     const unsigned long *src_pfns;
2026     unsigned long *dst_pfns;
2027     uvm_page_mask_t *page_mask;
2028     uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask;
2029     uvm_va_block_t *va_block;
2030     NV_STATUS status = NV_OK;
2031 
2032     processor_id = devmem_fault_context->processor_id;
2033     service_context = devmem_fault_context->service_context;
2034     va_block_retry = devmem_fault_context->va_block_retry;
2035     va_block = devmem_fault_context->va_block;
2036     src_pfns = service_context->block_context.hmm.src_pfns;
2037     dst_pfns = service_context->block_context.hmm.dst_pfns;
2038 
2039     // Build the migration page mask.
2040     // Note that thrashing pinned pages and prefetch pages are already
2041     // accounted for in service_context->per_processor_masks.
2042     page_mask = &devmem_fault_context->page_mask;
2043     uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
2044 
2045     status = alloc_and_copy_to_cpu(va_block,
2046                                    service_context->block_context.hmm.vma,
2047                                    src_pfns,
2048                                    dst_pfns,
2049                                    service_context->region,
2050                                    page_mask,
2051                                    same_devmem_page_mask,
2052                                    processor_id,
2053                                    service_context);
2054     if (status != NV_OK)
2055         return status;
2056 
2057     // Do the copy but don't update the residency or mapping for the new
2058     // location yet.
2059     return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2060 }
2061 
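// Second half: drop the pages that Linux decided not to migrate from the
// masks, complete the residency and mapping updates, and synchronize the
// va_block state with what actually migrated.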
2062 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2063 {
2064     uvm_processor_id_t processor_id;
2065     uvm_service_block_context_t *service_context;
2066     uvm_perf_prefetch_hint_t *prefetch_hint;
2067     uvm_va_block_retry_t *va_block_retry;
2068     const unsigned long *src_pfns;
2069     unsigned long *dst_pfns;
2070     uvm_page_mask_t *page_mask;
2071     uvm_va_block_t *va_block;
2072     uvm_va_block_region_t region;
2073     uvm_page_index_t page_index;
2074     NV_STATUS status, tracker_status;
2075 
2076     processor_id = devmem_fault_context->processor_id;
2077     service_context = devmem_fault_context->service_context;
2078     prefetch_hint = &service_context->prefetch_hint;
2079     va_block = devmem_fault_context->va_block;
2080     va_block_retry = devmem_fault_context->va_block_retry;
2081     src_pfns = service_context->block_context.hmm.src_pfns;
2082     dst_pfns = service_context->block_context.hmm.dst_pfns;
2083     region = service_context->region;
2084 
2085     page_mask = &devmem_fault_context->page_mask;
2086 
2087     // There are a number of reasons why HMM will mark a page as not migrating
2088     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2089     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2090         if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE)
2091             continue;
2092 
2093         // If a page isn't migrating and only the GPU page table is being
2094         // updated, continue to process it normally.
2095         if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index))
2096             continue;
2097 
2098         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2099         uvm_page_mask_clear(page_mask, page_index);
2100         clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
2101     }
2102 
2103     if (uvm_page_mask_empty(page_mask))
2104         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2105     else
2106         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2107 
2108     tracker_status = sync_page_and_chunk_state(va_block,
2109                                                src_pfns,
2110                                                dst_pfns,
2111                                                region,
2112                                                page_mask,
2113                                                &devmem_fault_context->same_devmem_page_mask);
2114 
2115     return status == NV_OK ? tracker_status : status;
2116 }
2117 
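// Update the va_block's CPU allocation, residency, and mapping state from the
// hmm_range_fault() results in 'pfns' for the given region. PFNs that refer
// to device private pages re-establish the corresponding GPU chunk pointers
// instead.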
2118 static NV_STATUS populate_region(uvm_va_block_t *va_block,
2119                                  unsigned long *pfns,
2120                                  uvm_va_block_region_t region,
2121                                  uvm_page_mask_t *populated_page_mask)
2122 {
2123     uvm_page_index_t page_index;
2124     NV_STATUS status;
2125 
2126     // Make sure GPU state is allocated or else the GPU DMA mappings to
2127     // system memory won't be saved.
2128     status = uvm_va_block_gpu_state_alloc(va_block);
2129     if (status != NV_OK)
2130         return status;
2131 
2132     for_each_va_block_page_in_region(page_index, region) {
2133         struct page *page;
2134 
2135         // This case should only happen when querying CPU residency and we ask
2136         // for something not covered by a VMA. Otherwise, hmm_range_fault()
2137         // returns -EFAULT instead of setting the HMM_PFN_ERROR bit.
2138         if (pfns[page_index] & HMM_PFN_ERROR)
2139             return NV_ERR_INVALID_ADDRESS;
2140 
2141         if (pfns[page_index] & HMM_PFN_VALID) {
2142             page = hmm_pfn_to_page(pfns[page_index]);
2143         }
2144         else {
2145             // The page can't be evicted since it has to be migrated to the GPU
2146             // first which would leave a device private page entry so this has
2147             // to be a pte_none(), swapped out, or similar entry.
2148             // The page would have been allocated if populate_region() is being
2149             // called from uvm_hmm_va_block_service_locked() so this must be
2150             // for uvm_hmm_va_block_update_residency_info(). Just leave the
2151             // residency/populated information unchanged since
2152             // uvm_hmm_invalidate() should handle that if the underlying page
2153             // is invalidated.
2154             // Also note there can be an allocated page due to GPU-to-GPU
2155             // migration between non-peer or indirect peer GPUs.
2156             continue;
2157         }
2158 
2159         if (is_device_private_page(page)) {
2160             // Linux can call hmm_invalidate() and we have to clear the GPU
2161             // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might
2162             // not release the device private struct page reference. Since
2163             // hmm_range_fault() did find a device private PTE, we can
2164             // re-establish the GPU chunk pointer.
2165             status = gpu_chunk_add(va_block, page_index, page);
2166             if (status != NV_OK)
2167                 return status;
2168             continue;
2169         }
2170 
2171         // If a CPU chunk is already allocated, check that it matches what
2172         // hmm_range_fault() found.
2173         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2174             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2175         }
2176         else {
2177             status = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2178             if (status != NV_OK)
2179                 return status;
2180 
2181             // Record that we populated this page. hmm_block_cpu_fault_locked()
2182             // uses this to ensure pages that don't migrate get remote mapped.
2183             if (populated_page_mask)
2184                 uvm_page_mask_set(populated_page_mask, page_index);
2185         }
2186 
2187         // Since we have a stable snapshot of the CPU pages, we can
2188         // update the residency and protection information.
2189         uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2190         uvm_page_mask_set(&va_block->cpu.resident, page_index);
2191 
2192         cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
2193     }
2194 
2195     return NV_OK;
2196 }
2197 
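// hmm_range_fault_begin() snapshots the block's HMM invalidation sequence
// number before the va_block lock is dropped; hmm_range_fault_retry() then
// reports whether an invalidation callback ran while the lock was not held.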
2198 static void hmm_range_fault_begin(uvm_va_block_t *va_block)
2199 {
2200     uvm_thread_context_t *uvm_context = uvm_thread_context();
2201 
2202     uvm_assert_mutex_locked(&va_block->lock);
2203     uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed;
2204 }
2205 
2206 static bool hmm_range_fault_retry(uvm_va_block_t *va_block)
2207 {
2208     uvm_thread_context_t *uvm_context = uvm_thread_context();
2209 
2210     uvm_assert_mutex_locked(&va_block->lock);
2211     return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed;
2212 }
2213 
2214 // Make the region be resident on the CPU by calling hmm_range_fault() to fault
2215 // in CPU pages.
2216 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block,
2217                                        struct vm_area_struct *vma,
2218                                        unsigned long *hmm_pfns,
2219                                        uvm_va_block_region_t region,
2220                                        NvU8 *access_type,
2221                                        uvm_page_mask_t *populated_page_mask)
2222 {
2223     uvm_page_index_t page_index;
2224     int ret;
2225     struct hmm_range range = {
2226         .notifier = &va_block->hmm.notifier,
2227         .start = uvm_va_block_region_start(va_block, region),
2228         .end = uvm_va_block_region_end(va_block, region) + 1,
2229         .hmm_pfns = hmm_pfns + region.first,
2230         .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
2231         .dev_private_owner = &g_uvm_global,
2232     };
2233 
2234     for_each_va_block_page_in_region(page_index, region) {
2235         if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) ||
2236             (vma->vm_flags & VM_WRITE))
2237             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
2238         else
2239             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT;
2240     }
2241 
2242     hmm_range_fault_begin(va_block);
2243 
2244     // Mirror the VA block to the HMM address range.
2245     // Note that we request HMM to handle page faults, which means that it will
2246     // populate and map potentially not-yet-existing pages to the VMA.
2247     // Also note that mmu_interval_read_begin() calls wait_event() for any
2248     // parallel invalidation callbacks to finish so we can't hold locks that
2249     // the invalidation callback acquires.
2250     uvm_mutex_unlock(&va_block->lock);
2251 
2252     range.notifier_seq = mmu_interval_read_begin(range.notifier);
2253     ret = hmm_range_fault(&range);
2254 
2255     uvm_mutex_lock(&va_block->lock);
2256 
2257     if (ret)
2258         return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret);
2259 
2260     if (hmm_range_fault_retry(va_block))
2261         return NV_WARN_MORE_PROCESSING_REQUIRED;
2262 
2263     return populate_region(va_block,
2264                            hmm_pfns,
2265                            region,
2266                            populated_page_mask);
2267 }
2268 
2269 // Release the reference count on any pages that were made device exclusive.
2270 static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
2271                                      uvm_service_block_context_t *service_context)
2272 {
2273     uvm_va_block_region_t region = service_context->region;
2274     uvm_page_index_t page_index;
2275 
2276     for_each_va_block_page_in_region(page_index, region) {
2277         struct page *page = service_context->block_context.hmm.pages[page_index];
2278 
2279         if (!page)
2280             continue;
2281 
2282         unlock_page(page);
2283         put_page(page);
2284     }
2285 }
2286 
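// Service an atomic GPU fault: migrate the region to the CPU if any pages are
// GPU resident, make the CPU pages device exclusive with
// make_device_exclusive_range(), and then map them on the faulting GPU.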
2287 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
2288                                                uvm_va_block_t *va_block,
2289                                                uvm_va_block_retry_t *va_block_retry,
2290                                                uvm_service_block_context_t *service_context)
2291 {
2292     uvm_va_block_region_t region = service_context->region;
2293     struct page **pages = service_context->block_context.hmm.pages;
2294     int npages;
2295     uvm_page_index_t page_index;
2296     uvm_make_resident_cause_t cause;
2297     NV_STATUS status;
2298 
2299     if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2300         !uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
2301         // There is an atomic GPU fault. We need to make sure no pages are
2302         // GPU resident so that make_device_exclusive_range() doesn't call
2303         // migrate_to_ram() and cause a va_space lock recursion problem.
2304         if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS)
2305             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
2306         else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS)
2307             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
2308         else
2309             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
2310 
2311         status = uvm_hmm_va_block_migrate_locked(va_block,
2312                                                  va_block_retry,
2313                                                  &service_context->block_context,
2314                                                  UVM_ID_CPU,
2315                                                  region,
2316                                                  cause);
2317         if (status != NV_OK)
2318             goto done;
2319 
2320         // make_device_exclusive_range() will try to call migrate_to_ram()
2321         // and deadlock with us if the data isn't CPU resident.
2322         if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2323             !uvm_page_mask_region_full(&va_block->cpu.resident, region)) {
2324             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2325             goto done;
2326         }
2327     }
2328 
2329     // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED
2330     // mmap() files so we check for that here and report a fatal fault.
2331     // Otherwise, the current Linux 6.1 make_device_exclusive_range() doesn't
2332     // make the page exclusive and we end up in an endless loop.
2333     if (service_context->block_context.hmm.vma->vm_flags & VM_SHARED) {
2334         status = NV_ERR_NOT_SUPPORTED;
2335         goto done;
2336     }
2337 
2338     hmm_range_fault_begin(va_block);
2339 
2340     uvm_mutex_unlock(&va_block->lock);
2341 
2342     npages = make_device_exclusive_range(service_context->block_context.mm,
2343         uvm_va_block_cpu_page_address(va_block, region.first),
2344         uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
2345         pages + region.first,
2346         &g_uvm_global);
2347 
2348     uvm_mutex_lock(&va_block->lock);
2349 
2350     if (npages < 0) {
2351         status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages);
2352         goto done;
2353     }
2354 
2355     while ((size_t)npages < uvm_va_block_region_num_pages(region))
2356         pages[region.first + npages++] = NULL;
2357 
2358     if (hmm_range_fault_retry(va_block)) {
2359         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2360         goto release;
2361     }
2362 
2363     status = NV_OK;
2364 
2365     for_each_va_block_page_in_region(page_index, region) {
2366         struct page *page = pages[page_index];
2367 
2368         if (!page) {
2369             // Record that one of the pages isn't exclusive but keep converting
2370             // the others.
2371             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2372             continue;
2373         }
2374 
2375         // If a CPU chunk is already allocated, check that it matches what
2376         // make_device_exclusive_range() found.
2377         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2378             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2379             UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2380             UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
2381         }
2382         else {
2383             NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2384 
2385             if (s == NV_OK) {
2386                 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2387                 uvm_page_mask_set(&va_block->cpu.resident, page_index);
2388             }
2389         }
2390 
2391         cpu_mapping_clear(va_block, page_index);
2392     }
2393 
2394     if (status != NV_OK)
2395         goto release;
2396 
2397     status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2398     if (status != NV_OK)
2399         goto release;
2400 
2401     status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2402 
2403 release:
2404     hmm_release_atomic_pages(va_block, service_context);
2405 
2406 done:
2407     return status;
2408 }
2409 
2410 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region)
2411 {
2412     uvm_page_index_t page_index;
2413 
2414     for_each_va_block_page_in_region(page_index, region) {
2415         if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG)
2416             return true;
2417     }
2418 
2419     return false;
2420 }
2421 
2422 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region)
2423 {
2424     uvm_processor_id_t gpu_id;
2425 
2426     for_each_gpu_id_in_mask(gpu_id, &va_block->resident) {
2427         uvm_va_block_gpu_state_t *gpu_state;
2428 
2429         gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2430         if (!uvm_page_mask_region_empty(&gpu_state->resident, region))
2431             return true;
2432     }
2433 
2434     return false;
2435 }
2436 
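// Service a fault whose new residency is the CPU. For GPU faults, the pages
// are first faulted in on the CPU; device private pages are then migrated
// back to system memory with the migrate_vma_*() API and the GPU is remote
// mapped as needed.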
2437 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
2438                                             uvm_va_block_t *va_block,
2439                                             uvm_va_block_retry_t *va_block_retry,
2440                                             uvm_service_block_context_t *service_context)
2441 {
2442     uvm_va_block_region_t region = service_context->region;
2443     struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
2444     NV_STATUS status;
2445     int ret;
2446     uvm_hmm_devmem_fault_context_t fault_context = {
2447         .processor_id = processor_id,
2448         .va_block = va_block,
2449         .va_block_retry = va_block_retry,
2450         .service_context = service_context,
2451     };
2452 
2453     // Normally the source page will be a device private page that is being
2454     // migrated to system memory. However, when it is a GPU fault, the source
2455     // page can be a system memory page that the GPU needs to remote map
2456     // instead. Note that migrate_vma_setup() won't select these types of
2457     // mappings/pages:
2458     //  - device exclusive PTEs
2459     //  - shared mappings
2460     //  - file backed mappings
2461     // Also, if the source and destination page are the same, the page reference
2462     // count won't be the "expected" count and migrate_vma_pages() won't migrate
2463     // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and
2464     // uvm_hmm_devmem_fault_finalize_and_map() still need to process these
2465     // pages even if src_pfn indicates they are not migrating.
2466     uvm_page_mask_zero(&fault_context.same_devmem_page_mask);
2467 
2468     if (!UVM_ID_IS_CPU(processor_id)) {
2469         if (is_atomic_fault(service_context->access_type, region)) {
2470             return hmm_block_atomic_fault_locked(processor_id,
2471                                                  va_block,
2472                                                  va_block_retry,
2473                                                  service_context);
2474         }
2475 
2476         status = hmm_make_resident_cpu(va_block,
2477                                        service_context->block_context.hmm.vma,
2478                                        service_context->block_context.hmm.src_pfns,
2479                                        region,
2480                                        service_context->access_type,
2481                                        &fault_context.same_devmem_page_mask);
2482         if (status != NV_OK)
2483             return status;
2484 
2485         // If no GPU has a resident copy, we can skip the migrate_vma_*().
2486         // This is necessary if uvm_hmm_must_use_sysmem() returned true.
2487         if (!is_gpu_resident(va_block, region)) {
2488             status = uvm_va_block_service_copy(processor_id,
2489                                                UVM_ID_CPU,
2490                                                va_block,
2491                                                va_block_retry,
2492                                                service_context);
2493             if (status != NV_OK)
2494                 return status;
2495 
2496             return uvm_va_block_service_finish(processor_id, va_block, service_context);
2497         }
2498     }
2499 
2500     args->vma = service_context->block_context.hmm.vma;
2501     args->src = service_context->block_context.hmm.src_pfns + region.first;
2502     args->dst = service_context->block_context.hmm.dst_pfns + region.first;
2503     args->start = uvm_va_block_region_start(va_block, region);
2504     args->end = uvm_va_block_region_end(va_block, region) + 1;
2505     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
2506     args->pgmap_owner = &g_uvm_global;
2507 
2508     if (UVM_ID_IS_CPU(processor_id)) {
2509         args->fault_page = service_context->cpu_fault.vmf->page;
2510     }
2511     else {
2512         args->flags |= MIGRATE_VMA_SELECT_SYSTEM;
2513         args->fault_page = NULL;
2514     }
2515 
2516     ret = migrate_vma_setup_locked(args, va_block);
2517     UVM_ASSERT(!ret);
2518 
2519     // The overall process here is to migrate pages from the GPU to the CPU
2520     // and possibly remote map the GPU to sysmem if accessed_by is set.
2521     // This is safe because we hold the va_block lock across the calls to
2522     // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(),
2523     // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize().
2524     // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block
2525     // lock, a sequence number is used to tell if an invalidate() callback
2526     // occurred while not holding the lock. If the sequence number changes,
2527     // all the locks need to be dropped (mm, va_space, va_block) and the whole
2528     // uvm_va_block_service_locked() called again. Otherwise, there were no
2529     // conflicting invalidate callbacks and our snapshots of the CPU page
2530     // tables are accurate and can be used to DMA pages and update GPU page
2531     // tables.
2532     status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context);
2533     if (status == NV_OK) {
2534         migrate_vma_pages(args);
2535         status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context);
2536     }
2537 
2538     migrate_vma_finalize(args);
2539 
2540     if (status == NV_WARN_NOTHING_TO_DO)
2541         status = NV_OK;
2542 
2543     return status;
2544 }
2545 
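// Prepare the source pages for a migration to 'dest_id': record GPU chunks
// for device private source pages, populate CPU chunks for system memory
// source pages so they can be DMA mapped for the copy, and clear page_mask
// (and the service_context masks) for pages that can't migrate.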
2546 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
2547                                          struct vm_area_struct *vma,
2548                                          const unsigned long *src_pfns,
2549                                          unsigned long *dst_pfns,
2550                                          uvm_va_block_region_t region,
2551                                          uvm_page_mask_t *page_mask,
2552                                          uvm_processor_id_t dest_id,
2553                                          uvm_service_block_context_t *service_context)
2554 {
2555     uvm_page_index_t page_index;
2556     NV_STATUS status = NV_OK;
2557 
2558     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2559         struct page *src_page;
2560 
2561         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
2562             // HMM currently has some limitations on what pages can be migrated:
2563             // for example, file-backed pages, device private pages owned by a
2564             // different device, and device exclusive or swapped-out pages.
2565             goto clr_mask;
2566         }
2567 
2568         // This is the page that will be copied to the destination GPU.
2569         src_page = migrate_pfn_to_page(src_pfns[page_index]);
2570         if (src_page) {
2571             if (is_device_private_page(src_page)) {
2572                 status = gpu_chunk_add(va_block, page_index, src_page);
2573                 if (status != NV_OK)
2574                     break;
2575                 continue;
2576             }
2577 
2578             if (PageSwapCache(src_page)) {
2579                 // TODO: Bug 4050579: Remove this when swap cached pages can be
2580                 // migrated.
2581                 if (service_context) {
2582                     service_context->block_context.hmm.swap_cached = true;
2583                     break;
2584                 }
2585 
2586                 goto clr_mask;
2587             }
2588 
2589             // If the page is already allocated, it is most likely a mirrored
2590             // page. Check to be sure it matches what we have recorded. The
2591             // page shouldn't be a staging page from a GPU to GPU migration
2592             // or a remote mapped atomic sysmem page because migrate_vma_setup()
2593             // found a normal page and non-mirrored pages are only known
2594             // privately to the UVM driver.
2595             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2596                 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
2597                 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2598                 UVM_ASSERT(uvm_page_mask_test(&va_block->cpu.resident, page_index));
2599             }
2600             else {
2601                 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
2602                 if (status != NV_OK)
2603                     goto clr_mask;
2604 
2605                 // Since there is a CPU resident page, there shouldn't be one
2606                 // anywhere else. TODO: Bug 3660922: Need to handle read
2607                 // duplication at some point.
2608                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2609 
2610                 // migrate_vma_setup() was able to isolate and lock the page;
2611                 // therefore, it is CPU resident and not mapped.
2612                 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
2613                 uvm_page_mask_set(&va_block->cpu.resident, page_index);
2614             }
2615 
2616             // The call to migrate_vma_setup() will have inserted a migration
2617             // PTE so the CPU has no access.
2618             cpu_mapping_clear(va_block, page_index);
2619         }
2620         else {
            // It is OK to migrate an empty anonymous page; a zero page will
            // be allocated on the GPU. Just be sure to free any pages used
            // for GPU to GPU copies. It can't be an evicted page because
            // migrate_vma_setup() would have found a source page.
2625             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2626                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2627 
2628                 hmm_va_block_cpu_page_unpopulate(va_block, page_index);
2629             }
2630         }
2631 
2632         continue;
2633 
2634     clr_mask:
2635         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2636         uvm_page_mask_clear(page_mask, page_index);
2637         if (service_context)
2638             clear_service_context_masks(service_context, dest_id, page_index);
2639     }
2640 
2641     if (uvm_page_mask_empty(page_mask) ||
2642         (service_context && service_context->block_context.hmm.swap_cached))
2643         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2644 
2645     if (status != NV_OK)
2646         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2647 
2648     return status;
2649 }
2650 
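// Allocate pages on the faulting GPU and copy the source data to them. Note
// that the residency and mapping updates for the new location are deferred to
// uvm_hmm_gpu_fault_finalize_and_map().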
2651 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
2652                                                   uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2653 {
2654     uvm_processor_id_t processor_id;
2655     uvm_processor_id_t new_residency;
2656     uvm_va_block_t *va_block;
2657     uvm_va_block_retry_t *va_block_retry;
2658     uvm_service_block_context_t *service_context;
2659     uvm_perf_prefetch_hint_t *prefetch_hint;
2660     const unsigned long *src_pfns;
2661     unsigned long *dst_pfns;
2662     uvm_va_block_region_t region;
2663     uvm_page_mask_t *page_mask;
2664     NV_STATUS status;
2665 
2666     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2667     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2668     va_block = uvm_hmm_gpu_fault_event->va_block;
2669     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2670     service_context = uvm_hmm_gpu_fault_event->service_context;
2671     region = service_context->region;
2672     prefetch_hint = &service_context->prefetch_hint;
2673     src_pfns = service_context->block_context.hmm.src_pfns;
2674     dst_pfns = service_context->block_context.hmm.dst_pfns;
2675 
2676     // Build the migration mask.
2677     // Note that thrashing pinned pages are already accounted for in
2678     // service_context->resident_processors.
2679     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2680     uvm_page_mask_copy(page_mask,
2681                        &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
2682 
2683     status = dmamap_src_sysmem_pages(va_block,
2684                                      vma,
2685                                      src_pfns,
2686                                      dst_pfns,
2687                                      region,
2688                                      page_mask,
2689                                      new_residency,
2690                                      service_context);
2691     if (status != NV_OK)
2692         return status;
2693 
2694     // Do the alloc and copy but don't update the residency or mapping for the
2695     // new location yet.
2696     status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context);
2697     if (status != NV_OK)
2698         return status;
2699 
2700     // Record the destination PFNs of device private struct pages now that
2701     // uvm_va_block_service_copy() has populated the GPU destination pages.
2702     fill_dst_pfns(va_block,
2703                   src_pfns,
2704                   dst_pfns,
2705                   region,
2706                   page_mask,
2707                   &uvm_hmm_gpu_fault_event->same_devmem_page_mask,
2708                   new_residency);
2709 
2710     return status;
2711 }
2712 
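// Finish servicing the GPU fault started by uvm_hmm_gpu_fault_alloc_and_copy():
// drop pages that HMM refused to migrate from the page mask, update residency
// and mappings, and sync the CPU page and GPU chunk state.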
2713 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2714 {
2715     uvm_processor_id_t processor_id;
2716     uvm_processor_id_t new_residency;
2717     uvm_va_block_t *va_block;
2718     uvm_va_block_retry_t *va_block_retry;
2719     uvm_service_block_context_t *service_context;
2720     const unsigned long *src_pfns;
2721     unsigned long *dst_pfns;
2722     uvm_va_block_region_t region;
2723     uvm_page_index_t page_index;
2724     uvm_page_mask_t *page_mask;
2725     NV_STATUS status, tracker_status;
2726 
2727     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2728     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2729     va_block = uvm_hmm_gpu_fault_event->va_block;
2730     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2731     service_context = uvm_hmm_gpu_fault_event->service_context;
2732     src_pfns = service_context->block_context.hmm.src_pfns;
2733     dst_pfns = service_context->block_context.hmm.dst_pfns;
2734     region = service_context->region;
2735     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2736 
2737     // There are a number of reasons why HMM will mark a page as not migrating
2738     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2739     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2740         unsigned long src_pfn = src_pfns[page_index];
2741 
2742         if (src_pfn & MIGRATE_PFN_MIGRATE)
2743             continue;
2744 
2745         // If a device private page isn't migrating and only the GPU page table
2746         // is being updated, continue to process it normally.
2747         if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index))
2748             continue;
2749 
2750         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2751         uvm_page_mask_clear(page_mask, page_index);
2752         clear_service_context_masks(service_context, new_residency, page_index);
2753     }
2754 
2755     if (uvm_page_mask_empty(page_mask))
2756         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2757     else
2758         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2759 
2760     tracker_status = sync_page_and_chunk_state(va_block,
2761                                                src_pfns,
2762                                                dst_pfns,
2763                                                region,
2764                                                page_mask,
2765                                                &uvm_hmm_gpu_fault_event->same_devmem_page_mask);
2766 
2767     return status == NV_OK ? tracker_status : status;
2768 }
2769 
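// Service faults on an HMM va_block by making the faulting region resident on
// new_residency (CPU faults are forwarded to hmm_block_cpu_fault_locked()).
// The caller must hold mmap_lock, the va_space lock, the va_block HMM migrate
// lock and the va_block lock.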
2770 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
2771                                           uvm_processor_id_t new_residency,
2772                                           uvm_va_block_t *va_block,
2773                                           uvm_va_block_retry_t *va_block_retry,
2774                                           uvm_service_block_context_t *service_context)
2775 {
2776     struct mm_struct *mm = service_context->block_context.mm;
2777     struct vm_area_struct *vma = service_context->block_context.hmm.vma;
2778     uvm_va_block_region_t region = service_context->region;
2779     uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
2780     struct migrate_vma *args = &service_context->block_context.hmm.migrate_vma_args;
2781     int ret;
2782     NV_STATUS status = NV_ERR_INVALID_ADDRESS;
2783 
2784     if (!mm)
2785         return status;
2786 
2787     uvm_assert_mmap_lock_locked(mm);
2788     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
2789     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
2790     uvm_assert_mutex_locked(&va_block->lock);
2791     UVM_ASSERT(vma);
2792 
2793     // If the desired destination is the CPU, try to fault in CPU pages.
2794     if (UVM_ID_IS_CPU(new_residency))
2795         return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context);
2796 
2797     uvm_hmm_gpu_fault_event.processor_id = processor_id;
2798     uvm_hmm_gpu_fault_event.new_residency = new_residency;
2799     uvm_hmm_gpu_fault_event.va_block = va_block;
2800     uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry;
2801     uvm_hmm_gpu_fault_event.service_context = service_context;
2802 
2803     args->vma = vma;
2804     args->src = service_context->block_context.hmm.src_pfns + region.first;
2805     args->dst = service_context->block_context.hmm.dst_pfns + region.first;
2806     args->start = uvm_va_block_region_start(va_block, region);
2807     args->end = uvm_va_block_region_end(va_block, region) + 1;
2808     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
2809     args->pgmap_owner = &g_uvm_global;
2810     args->fault_page = NULL;
2811 
2812     ret = migrate_vma_setup_locked(args, va_block);
2813     UVM_ASSERT(!ret);
2814 
2815     // The overall process here is to migrate pages from the CPU or GPUs to the
2816     // faulting GPU.
2817     // This is safe because we hold the va_block lock across the calls to
2818     // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(),
2819     // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize().
2820     // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block
2821     // lock, a sequence number is used to tell if an invalidate() callback
2822     // occurred while not holding the lock. If the sequence number changes,
    // all the locks need to be dropped (mm, va_space, va_block) and
    // uvm_va_block_service_locked() needs to be called again. Otherwise,
    // there were no conflicting invalidate callbacks and our snapshots of the
    // CPU page tables are accurate and can be used to DMA pages and update
    // GPU page tables.
    // TODO: Bug 3901904: there might be better ways of handling no page being
    // migrated.
2829     status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event);
2830     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2831         migrate_vma_finalize(args);
2832 
        // migrate_vma_setup() might not have been able to lock/isolate any
        // pages because they are swapped out or device exclusive.
        // We do know that none of the pages in the region are zero pages
        // since migrate_vma_setup() would have reported that information.
        // Try to make the region resident in system memory and retry the
        // migration.
2838         status = hmm_make_resident_cpu(va_block,
2839                                        service_context->block_context.hmm.vma,
2840                                        service_context->block_context.hmm.src_pfns,
2841                                        region,
2842                                        service_context->access_type,
2843                                        NULL);
2844         return NV_WARN_MORE_PROCESSING_REQUIRED;
2845     }
2846 
2847     if (status == NV_OK) {
2848         migrate_vma_pages(args);
2849         status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event);
2850     }
2851 
2852     migrate_vma_finalize(args);
2853 
2854     if (status == NV_WARN_NOTHING_TO_DO)
2855         status = NV_OK;
2856 
2857     return status;
2858 }
2859 
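// First phase of an HMM migration: isolate the source pages and allocate and
// copy to the destination (CPU or GPU) without updating the residency or
// mappings yet; uvm_hmm_migrate_finalize() completes the migration.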
2860 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
2861                                                 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
2862 {
2863     uvm_va_block_t *va_block;
2864     uvm_va_block_retry_t *va_block_retry;
2865     uvm_va_block_context_t *va_block_context;
2866     const unsigned long *src_pfns;
2867     unsigned long *dst_pfns;
2868     uvm_va_block_region_t region;
2869     uvm_processor_id_t dest_id;
2870     uvm_page_mask_t *page_mask;
2871     NV_STATUS status;
2872 
2873     va_block = uvm_hmm_migrate_event->va_block;
2874     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
2875     va_block_context = uvm_hmm_migrate_event->va_block_context;
2876     src_pfns = va_block_context->hmm.src_pfns;
2877     dst_pfns = va_block_context->hmm.dst_pfns;
2878     region = uvm_hmm_migrate_event->region;
2879     dest_id = uvm_hmm_migrate_event->dest_id;
2880     page_mask = &uvm_hmm_migrate_event->page_mask;
2881     uvm_page_mask_init_from_region(page_mask, region, NULL);
2882     uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask);
2883 
2884     uvm_assert_mutex_locked(&va_block->lock);
2885 
2886     if (UVM_ID_IS_CPU(dest_id)) {
2887         status = alloc_and_copy_to_cpu(va_block,
2888                                        vma,
2889                                        src_pfns,
2890                                        dst_pfns,
2891                                        region,
2892                                        page_mask,
2893                                        &uvm_hmm_migrate_event->same_devmem_page_mask,
2894                                        UVM_ID_INVALID,
2895                                        NULL);
2896     }
2897     else {
2898         status = dmamap_src_sysmem_pages(va_block,
2899                                          vma,
2900                                          src_pfns,
2901                                          dst_pfns,
2902                                          region,
2903                                          page_mask,
2904                                          dest_id,
2905                                          NULL);
2906     }
2907     if (status != NV_OK)
2908         return status;
2909 
2910     UVM_ASSERT(!uvm_va_policy_is_read_duplicate(va_block_context->policy, va_block->hmm.va_space));
2911 
2912     status = uvm_va_block_make_resident_copy(va_block,
2913                                              va_block_retry,
2914                                              va_block_context,
2915                                              dest_id,
2916                                              region,
2917                                              page_mask,
2918                                              NULL,
2919                                              uvm_hmm_migrate_event->cause);
2920     if (status != NV_OK)
2921         return status;
2922 
2923     if (!UVM_ID_IS_CPU(dest_id)) {
2924         // Record the destination PFNs of device private struct pages now that
2925         // uvm_va_block_make_resident_copy() has populated the GPU destination
2926         // pages.
2927         fill_dst_pfns(va_block,
2928                       src_pfns,
2929                       dst_pfns,
2930                       region,
2931                       page_mask,
2932                       &uvm_hmm_migrate_event->same_devmem_page_mask,
2933                       dest_id);
2934     }
2935 
2936     return status;
2937 }
2938 
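// Second phase of an HMM migration: drop pages that HMM decided not to
// migrate from the page mask, finish the residency update, and sync the CPU
// page and GPU chunk state.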
2939 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
2940 {
2941     uvm_va_block_t *va_block;
2942     uvm_va_block_retry_t *va_block_retry;
2943     uvm_va_block_context_t *va_block_context;
2944     uvm_va_block_region_t region;
2945     uvm_processor_id_t dest_id;
2946     uvm_page_index_t page_index;
2947     uvm_page_mask_t *page_mask;
2948     const unsigned long *src_pfns;
2949     unsigned long *dst_pfns;
2950 
2951     va_block = uvm_hmm_migrate_event->va_block;
2952     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
2953     va_block_context = uvm_hmm_migrate_event->va_block_context;
2954     region = uvm_hmm_migrate_event->region;
2955     dest_id = uvm_hmm_migrate_event->dest_id;
2956     page_mask = &uvm_hmm_migrate_event->page_mask;
2957     src_pfns = va_block_context->hmm.src_pfns;
2958     dst_pfns = va_block_context->hmm.dst_pfns;
2959 
2960     uvm_assert_mutex_locked(&va_block->lock);
2961 
2962     // There are a number of reasons why HMM will mark a page as not migrating
2963     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2964     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2965         unsigned long src_pfn = src_pfns[page_index];
2966 
2967         if (src_pfn & MIGRATE_PFN_MIGRATE)
2968             continue;
2969 
2970         // If a device private page isn't migrating and only the GPU page table
2971         // is being updated, continue to process it normally.
2972         if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index))
2973             continue;
2974 
2975         uvm_page_mask_clear(page_mask, page_index);
2976     }
2977 
2978     uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask);
2979 
2980     return sync_page_and_chunk_state(va_block,
2981                                      src_pfns,
2982                                      dst_pfns,
2983                                      region,
2984                                      page_mask,
2985                                      &uvm_hmm_migrate_event->same_devmem_page_mask);
2986 }
2987 
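// Returns true if every page of the given region is resident on dest_id.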
2988 static bool is_resident(uvm_va_block_t *va_block,
2989                         uvm_processor_id_t dest_id,
2990                         uvm_va_block_region_t region)
2991 {
2992     if (!uvm_processor_mask_test(&va_block->resident, dest_id))
2993         return false;
2994 
2995     return uvm_page_mask_region_full(uvm_va_block_resident_mask_get(va_block, dest_id), region);
2996 }
2997 
2998 // Note that migrate_vma_*() doesn't handle asynchronous migrations so the
2999 // migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
3000 // TODO: Bug 3900785: investigate ways to implement async migration.
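// Migrate the given region of an HMM va_block to dest_id. The caller must
// hold mmap_lock, the va_space lock, the va_block HMM migrate lock and the
// va_block lock.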
3001 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
3002                                           uvm_va_block_retry_t *va_block_retry,
3003                                           uvm_va_block_context_t *va_block_context,
3004                                           uvm_processor_id_t dest_id,
3005                                           uvm_va_block_region_t region,
3006                                           uvm_make_resident_cause_t cause)
3007 {
3008     uvm_hmm_migrate_event_t uvm_hmm_migrate_event;
3009     struct vm_area_struct *vma = va_block_context->hmm.vma;
3010     NvU64 start;
3011     NvU64 end;
3012     struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args;
3013     NV_STATUS status;
3014     int ret;
3015 
3016     UVM_ASSERT(vma);
3017     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
3018     uvm_assert_mmap_lock_locked(va_block_context->mm);
3019     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
3020     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
3021     uvm_assert_mutex_locked(&va_block->lock);
3022 
3023     start = uvm_va_block_region_start(va_block, region);
3024     end = uvm_va_block_region_end(va_block, region);
3025     UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end);
3026 
3027     uvm_hmm_migrate_event.va_block = va_block;
3028     uvm_hmm_migrate_event.va_block_retry = va_block_retry;
3029     uvm_hmm_migrate_event.va_block_context = va_block_context;
3030     uvm_hmm_migrate_event.region = region;
3031     uvm_hmm_migrate_event.dest_id = dest_id;
3032     uvm_hmm_migrate_event.cause = cause;
3033 
3034     args->vma = vma;
3035     args->src = va_block_context->hmm.src_pfns + region.first;
3036     args->dst = va_block_context->hmm.dst_pfns + region.first;
3037     args->start = uvm_va_block_region_start(va_block, region);
3038     args->end = uvm_va_block_region_end(va_block, region) + 1;
3039     args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
3040                                            MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
3041     args->pgmap_owner = &g_uvm_global;
3042     args->fault_page = NULL;
3043 
    // Note that migrate_vma_setup() doesn't handle file-backed or VM_SPECIAL
    // VMAs, so if UvmMigrate() tries to migrate such a region, -EINVAL will
    // be returned and we only try to make the pages resident on the CPU.
3047     ret = migrate_vma_setup_locked(args, va_block);
3048     if (ret)
3049         return hmm_make_resident_cpu(va_block,
3050                                      vma,
3051                                      va_block_context->hmm.src_pfns,
3052                                      region,
3053                                      NULL,
3054                                      NULL);
3055 
3056     // The overall process here is to migrate pages from the CPU or GPUs to the
3057     // destination processor. Note that block_migrate_add_mappings() handles
3058     // updating GPU mappings after the migration.
3059     // This is safe because we hold the va_block lock across the calls to
3060     // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
3061     // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
3062     // block_migrate_add_mappings().
3063     // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block
3064     // lock, a sequence number is used to tell if an invalidate() callback
3065     // occurred while not holding the lock. If the sequence number changes,
    // all the locks need to be dropped (mm, va_space, va_block) and
    // uvm_hmm_va_block_migrate_locked() needs to be called again. Otherwise,
    // there were no conflicting invalidate callbacks and our snapshots of the
    // CPU page tables are accurate and can be used to DMA pages and update
    // GPU page tables.
3071     status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event);
3072     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
3073         uvm_processor_id_t id;
3074         uvm_page_mask_t *page_mask;
3075 
3076         migrate_vma_finalize(args);
3077 
        // The CPU page tables might contain only device private pages, or
        // migrate_vma_setup() might not have been able to lock/isolate any
        // pages because they are swapped out or on another device.
        // We do know that none of the pages in the region are zero pages
        // since migrate_vma_setup() would have reported that information.
        // Collect all the pages that need to be faulted in and made CPU
        // resident, then do the hmm_range_fault() and retry.
3085         page_mask = &va_block_context->caller_page_mask;
3086         uvm_page_mask_init_from_region(page_mask, region, NULL);
3087 
3088         for_each_id_in_mask(id, &va_block->resident) {
3089             if (!uvm_page_mask_andnot(page_mask,
3090                                       page_mask,
3091                                       uvm_va_block_resident_mask_get(va_block, id)))
3092                 return NV_OK;
3093         }
3094 
3095         return hmm_make_resident_cpu(va_block,
3096                                      vma,
3097                                      va_block_context->hmm.src_pfns,
3098                                      region,
3099                                      NULL,
3100                                      NULL);
3101     }
3102 
3103     if (status == NV_OK) {
3104         migrate_vma_pages(args);
3105         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3106     }
3107 
3108     migrate_vma_finalize(args);
3109 
3110     if (status == NV_WARN_NOTHING_TO_DO)
3111         status = NV_OK;
3112 
3113     return status;
3114 }
3115 
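// Migrate [base, base + length) to dest_id one va_block/VMA intersection at a
// time, creating HMM va_blocks as needed.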
3116 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
3117                                  uvm_va_block_context_t *va_block_context,
3118                                  NvU64 base,
3119                                  NvU64 length,
3120                                  uvm_processor_id_t dest_id,
3121                                  uvm_migrate_mode_t mode,
3122                                  uvm_tracker_t *out_tracker)
3123 {
3124     struct mm_struct *mm;
3125     uvm_va_block_t *va_block;
3126     uvm_va_block_retry_t va_block_retry;
3127     NvU64 addr, end, last_address;
3128     NV_STATUS status = NV_OK;
3129 
3130     if (!uvm_hmm_is_enabled(va_space))
3131         return NV_ERR_INVALID_ADDRESS;
3132 
3133     mm = va_block_context->mm;
3134     UVM_ASSERT(mm == va_space->va_space_mm.mm);
3135     uvm_assert_mmap_lock_locked(mm);
3136     uvm_assert_rwsem_locked(&va_space->lock);
3137 
3138     last_address = base + length - 1;
3139 
3140     for (addr = base; addr < last_address; addr = end + 1) {
3141         struct vm_area_struct *vma;
3142 
3143         status = hmm_va_block_find_create(va_space, addr, false, va_block_context, &va_block);
3144         if (status != NV_OK)
3145             return status;
3146 
3147         end = va_block->end;
3148         if (end > last_address)
3149             end = last_address;
3150 
3151         vma = va_block_context->hmm.vma;
3152         if (end > vma->vm_end - 1)
3153             end = vma->vm_end - 1;
3154 
3155         status = hmm_migrate_range(va_block,
3156                                    &va_block_retry,
3157                                    va_block_context,
3158                                    dest_id,
3159                                    addr,
3160                                    end,
3161                                    mode,
3162                                    out_tracker);
3163         if (status != NV_OK)
3164             break;
3165     }
3166 
3167     return status;
3168 }
3169 
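// Prepare to evict a GPU chunk by filling va_block_context->hmm.src_pfns with
// the chunk's device private PFN via migrate_device_range(). The HMM
// invalidate callback is told to ignore this va_block while
// migrate_device_range() runs.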
3170 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
3171                                             uvm_va_block_context_t *va_block_context,
3172                                             uvm_gpu_chunk_t *gpu_chunk,
3173                                             uvm_va_block_region_t chunk_region)
3174 {
3175     uvm_thread_context_t *uvm_context = uvm_thread_context();
3176     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3177     uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
3178     unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
3179     uvm_page_index_t page_index = chunk_region.first;
3180     int ret;
3181 
3182     uvm_assert_mutex_locked(&va_block->lock);
3183     // TODO: Bug 3368756: add support for large GPU pages.
3184     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1);
3185 
3186     uvm_context->ignore_hmm_invalidate_va_block = va_block;
3187     ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region));
3188     uvm_context->ignore_hmm_invalidate_va_block = NULL;
3189     if (ret)
3190         return errno_to_nv_status(ret);
3191 
3192     return NV_OK;
3193 }
3194 
3195 // Note that the caller must initialize va_block_context->hmm.src_pfns by
3196 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this.
3197 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3198                                            uvm_va_block_context_t *va_block_context,
3199                                            const uvm_page_mask_t *pages_to_evict,
3200                                            uvm_va_block_region_t region,
3201                                            uvm_make_resident_cause_t cause,
3202                                            bool *out_accessed_by_set)
3203 {
3204     NvU64 start = uvm_va_block_region_start(va_block, region);
3205     NvU64 end = uvm_va_block_region_end(va_block, region);
3206     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3207     unsigned long *dst_pfns = va_block_context->hmm.dst_pfns;
3208     uvm_hmm_migrate_event_t uvm_hmm_migrate_event = {
3209         .va_block = va_block,
3210         .va_block_retry = NULL,
3211         .va_block_context = va_block_context,
3212         .region = region,
3213         .dest_id = UVM_ID_CPU,
3214         .cause = cause,
3215     };
3216     uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
3217     const uvm_va_policy_t *policy;
3218     uvm_va_policy_node_t *node;
3219     unsigned long npages;
3220     NV_STATUS status;
3221 
3222     uvm_assert_mutex_locked(&va_block->lock);
3223 
3224     if (out_accessed_by_set)
3225         *out_accessed_by_set = false;
3226 
3227     // Note that there is no VMA available when evicting HMM pages.
3228     va_block_context->hmm.vma = NULL;
3229 
3230     uvm_page_mask_copy(page_mask, pages_to_evict);
3231 
3232     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
3233         npages = uvm_va_block_region_num_pages(region);
3234 
3235         va_block_context->policy = policy;
3236         if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0)
3237             *out_accessed_by_set = true;
3238 
3239         // Pages resident on the GPU should not have a resident page in system
3240         // memory.
3241         // TODO: Bug 3660922: Need to handle read duplication at some point.
3242         UVM_ASSERT(uvm_page_mask_region_empty(&va_block->cpu.resident, region));
3243 
3244         status = alloc_and_copy_to_cpu(va_block,
3245                                        NULL,
3246                                        src_pfns,
3247                                        dst_pfns,
3248                                        region,
3249                                        page_mask,
3250                                        NULL,
3251                                        UVM_ID_INVALID,
3252                                        NULL);
3253         if (status != NV_OK)
3254             goto err;
3255 
3256         status = uvm_va_block_make_resident_copy(va_block,
3257                                                  NULL,
3258                                                  va_block_context,
3259                                                  UVM_ID_CPU,
3260                                                  region,
3261                                                  page_mask,
3262                                                  NULL,
3263                                                  cause);
3264         if (status != NV_OK)
3265             goto err;
3266 
3267         migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);
3268 
3269         uvm_hmm_migrate_event.region = region;
3270 
3271         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3272         if (status != NV_OK)
3273             goto err;
3274 
3275         migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3276     }
3277 
3278     return NV_OK;
3279 
3280 err:
3281     migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3282     return status;
3283 }
3284 
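// Evict the given pages from the GPU to the CPU due to GPU memory pressure.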
3285 NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3286                                         uvm_va_block_context_t *va_block_context,
3287                                         const uvm_page_mask_t *pages_to_evict,
3288                                         uvm_va_block_region_t region,
3289                                         bool *out_accessed_by_set)
3290 {
3291     return hmm_va_block_evict_chunks(va_block,
3292                                      va_block_context,
3293                                      pages_to_evict,
3294                                      region,
3295                                      UVM_MAKE_RESIDENT_CAUSE_EVICTION,
3296                                      out_accessed_by_set);
3297 }
3298 
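// Evict the given pages resident on the specified GPU back to the CPU on
// behalf of an API-initiated migration. src_pfns[] is filled with the GPU's
// device private PFNs before reusing the common eviction path.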
3299 NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
3300                                                 uvm_gpu_t *gpu,
3301                                                 uvm_va_block_context_t *va_block_context,
3302                                                 const uvm_page_mask_t *pages_to_evict,
3303                                                 uvm_va_block_region_t region)
3304 {
3305     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3306     uvm_va_block_gpu_state_t *gpu_state;
3307     uvm_page_index_t page_index;
3308     uvm_gpu_chunk_t *gpu_chunk;
3309     NV_STATUS status;
3310 
3311     uvm_assert_mutex_locked(&va_block->lock);
3312 
3313     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
3314     UVM_ASSERT(gpu_state);
3315     UVM_ASSERT(gpu_state->chunks);
3316 
3317     // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
3318     memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));
3319 
3320     // TODO: Bug 3368756: add support for large GPU pages.
3321     for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
3322         gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
3323                                                   gpu,
3324                                                   uvm_va_block_cpu_page_address(va_block, page_index));
3325         status = uvm_hmm_va_block_evict_chunk_prep(va_block,
3326                                                    va_block_context,
3327                                                    gpu_chunk,
3328                                                    uvm_va_block_region_for_page(page_index));
3329         if (status != NV_OK)
3330             return status;
3331     }
3332 
3333     return hmm_va_block_evict_chunks(va_block,
3334                                      va_block_context,
3335                                      pages_to_evict,
3336                                      region,
3337                                      UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
3338                                      NULL);
3339 }
3340 
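// Evict a single device private page, given its PFN, when no va_block is
// available. See the comment below on why a zero page is substituted in that
// case.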
3341 NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
3342 {
3343     unsigned long src_pfn = 0;
3344     unsigned long dst_pfn = 0;
3345     struct page *dst_page;
3346     NV_STATUS status = NV_OK;
3347     int ret;
3348 
3349     ret = migrate_device_range(&src_pfn, pfn, 1);
3350     if (ret)
3351         return errno_to_nv_status(ret);
3352 
3353     if (src_pfn & MIGRATE_PFN_MIGRATE) {
        // All the code for copying a vidmem page to sysmem relies on
        // having a va_block. However, certain combinations of mremap()
        // and fork() can result in device-private pages being mapped
        // in a child process without a va_block.
        //
        // We don't expect the above to be a common occurrence, so for
        // now we allocate a fresh zero page when evicting without a
        // va_block. However, this results in child processes losing
        // data, so make sure we warn about it. Ideally we would just
        // not migrate and SIGBUS the child if it tries to access the
        // page. However, that would prevent unloading of the driver, so
        // we're stuck with this until we fix the problem.
3366         // TODO: Bug 3902536: add code to migrate GPU memory without having a
3367         // va_block.
3368         WARN_ON(1);
3369         dst_page = alloc_page(GFP_HIGHUSER_MOVABLE | __GFP_ZERO);
3370         if (!dst_page) {
3371             status = NV_ERR_NO_MEMORY;
3372             goto out;
3373         }
3374 
3375         lock_page(dst_page);
3376         dst_pfn = migrate_pfn(page_to_pfn(dst_page));
3377 
3378         migrate_device_pages(&src_pfn, &dst_pfn, 1);
3379     }
3380 
3381 out:
3382     migrate_device_finalize(&src_pfn, &dst_pfn, 1);
3383 
3384     return status;
3385 }
3386 
3387 // The routines below are all for UVM-HMM tests.
3388 
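// Test-only helper: report the bounds of the HMM "managed range" containing
// lookup_address, i.e., the intersection of the enclosing va_block-sized
// region and its VMA.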
3389 NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
3390                                         struct mm_struct *mm,
3391                                         NvU64 lookup_address,
3392                                         NvU64 *startp,
3393                                         NvU64 *endp,
3394                                         UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
3395 {
3396     struct vm_area_struct *vma;
3397     NvU64 start;
3398     NvU64 end;
3399 
3400     if (!uvm_hmm_is_enabled(va_space) || !mm)
3401         return NV_ERR_INVALID_ADDRESS;
3402 
3403     uvm_assert_mmap_lock_locked(mm);
3404     uvm_assert_rwsem_locked(&va_space->lock);
3405 
3406     // The VMA might have changed while not holding mmap_lock so check it.
3407     vma = find_vma(mm, lookup_address);
3408     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3409         return NV_ERR_INVALID_ADDRESS;
3410 
3411     // Since managed VA ranges don't cover more than one VMA, return only the
3412     // intersecting range of the VA block and VMA.
3413     start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
3414     end = start + UVM_VA_BLOCK_SIZE - 1;
3415     if (start < vma->vm_start)
3416         start = vma->vm_start;
3417     if (end > vma->vm_end - 1)
3418         end = vma->vm_end - 1;
3419 
3420     *startp = start;
3421     *endp   = end;
3422 
3423     if (params) {
3424         uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
3425         params->resident_physical_size[0] = PAGE_SIZE;
3426         params->resident_on_count = 1;
3427 
3428         uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
3429         params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
3430                                   UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
3431         params->page_size[0] = PAGE_SIZE;
3432         params->mapped_on_count = 1;
3433 
3434         uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
3435         params->populated_on_count = 1;
3436     }
3437 
3438     return NV_OK;
3439 }
3440 
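// Test-only helper: snapshot the CPU page state for lookup_address with
// hmm_range_fault() (optionally faulting the page in) and update the
// va_block's CPU state to match.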
3441 NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
3442                                                  struct mm_struct *mm,
3443                                                  NvU64 lookup_address,
3444                                                  bool populate)
3445 {
3446     uvm_va_space_t *va_space = va_block->hmm.va_space;
3447     struct vm_area_struct *vma;
3448     struct hmm_range range;
3449     uvm_va_block_region_t region;
3450     unsigned long pfn;
3451     NvU64 end;
3452     int ret;
3453     NV_STATUS status;
3454 
3455     if (!uvm_hmm_is_enabled(va_space) || !mm)
3456         return NV_ERR_INVALID_ADDRESS;
3457 
3458     uvm_assert_mmap_lock_locked(mm);
3459     uvm_assert_rwsem_locked(&va_space->lock);
3460 
3461     // The VMA might have changed while not holding mmap_lock so check it.
3462     vma = find_vma(mm, lookup_address);
3463     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3464         return NV_ERR_INVALID_ADDRESS;
3465 
3466     end = lookup_address + PAGE_SIZE;
3467     region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);
3468 
3469     range.notifier = &va_block->hmm.notifier;
3470     range.start = lookup_address;
3471     range.end = end;
3472     range.hmm_pfns = &pfn;
3473     range.default_flags = 0;
3474     range.pfn_flags_mask = 0;
3475     range.dev_private_owner = &g_uvm_global;
3476 
3477     if (populate) {
3478         range.default_flags = HMM_PFN_REQ_FAULT;
3479         if (vma->vm_flags & VM_WRITE)
3480             range.default_flags |= HMM_PFN_REQ_WRITE;
3481     }
3482 
3483     uvm_hmm_migrate_begin_wait(va_block);
3484 
3485     while (true) {
3486         range.notifier_seq = mmu_interval_read_begin(range.notifier);
3487         ret = hmm_range_fault(&range);
3488         if (ret == -EBUSY)
3489             continue;
3490         if (ret) {
3491             uvm_hmm_migrate_finish(va_block);
3492             return errno_to_nv_status(ret);
3493         }
3494 
3495         uvm_mutex_lock(&va_block->lock);
3496 
3497         if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
3498             break;
3499 
3500         uvm_mutex_unlock(&va_block->lock);
3501     }
3502 
3503     // Update the va_block CPU state based on the snapshot.
3504     // Note that we have to adjust the pfns address since it will be indexed
3505     // by region.first.
3506     status = populate_region(va_block, &pfn - region.first, region, NULL);
3507 
3508     uvm_mutex_unlock(&va_block->lock);
3509     uvm_hmm_migrate_finish(va_block);
3510 
    return status;
3512 }
3513 
3514 NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
3515 {
3516     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3517 
3518     atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);
3519 
3520     return NV_OK;
3521 }
3522 
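// Test-only helper: fill in UVM_TEST_VA_RANGE_INFO_PARAMS for the VMA, HMM
// va_block and policy node covering lookup_address.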
3523 NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
3524                                 struct mm_struct *mm,
3525                                 UVM_TEST_VA_RANGE_INFO_PARAMS *params)
3526 {
3527     uvm_range_tree_node_t *tree_node;
3528     const uvm_va_policy_node_t *node;
3529     struct vm_area_struct *vma;
3530     uvm_va_block_t *va_block;
3531 
3532     if (!mm || !uvm_hmm_is_enabled(va_space))
3533         return NV_ERR_INVALID_ADDRESS;
3534 
3535     uvm_assert_mmap_lock_locked(mm);
3536     uvm_assert_rwsem_locked(&va_space->lock);
3537 
3538     params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
3539     params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
3540     params->va_range_start = 0;
3541     params->va_range_end = ULONG_MAX;
3542     params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
3543     memset(&params->preferred_location, 0, sizeof(params->preferred_location));
3544     params->accessed_by_count = 0;
3545     params->managed.vma_start = 0;
3546     params->managed.vma_end = 0;
3547     params->managed.is_zombie = NV_FALSE;
3548     params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
3549 
3550     vma = find_vma(mm, params->lookup_address);
3551     if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
3552         return NV_ERR_INVALID_ADDRESS;
3553 
3554     params->va_range_start = vma->vm_start;
3555     params->va_range_end   = vma->vm_end - 1;
3556     params->managed.vma_start = vma->vm_start;
3557     params->managed.vma_end   = vma->vm_end - 1;
3558 
3559     uvm_mutex_lock(&va_space->hmm.blocks_lock);
3560     tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
3561     if (!tree_node) {
3562         UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
3563                                                &params->va_range_start, &params->va_range_end) == NV_OK);
3564         uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3565         return NV_OK;
3566     }
3567 
3568     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3569     va_block = hmm_va_block_from_node(tree_node);
3570     uvm_mutex_lock(&va_block->lock);
3571 
3572     params->va_range_start = va_block->start;
3573     params->va_range_end   = va_block->end;
3574 
3575     node = uvm_va_policy_node_find(va_block, params->lookup_address);
3576     if (node) {
3577         uvm_processor_id_t processor_id;
3578 
3579         if (params->va_range_start < node->node.start)
3580             params->va_range_start = node->node.start;
3581         if (params->va_range_end > node->node.end)
3582             params->va_range_end = node->node.end;
3583 
3584         params->read_duplication = node->policy.read_duplication;
3585 
3586         if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
3587             uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
3588 
3589         for_each_id_in_mask(processor_id, &node->policy.accessed_by)
3590             uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
3591     }
3592     else {
3593         uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
3594                                     &params->va_range_start, &params->va_range_end);
3595     }
3596 
3597     uvm_mutex_unlock(&va_block->lock);
3598 
3599     return NV_OK;
3600 }
3601 
3602 // TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
3603 // for VMAs other than anonymous private memory.
3604 bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
3605                              uvm_va_block_context_t *va_block_context)
3606 {
3607     struct vm_area_struct *vma = va_block_context->hmm.vma;
3608 
3609     uvm_assert_mutex_locked(&va_block->lock);
3610 
3611     if (!uvm_va_block_is_hmm(va_block))
3612         return false;
3613 
3614     UVM_ASSERT(vma);
3615     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
3616     uvm_assert_mmap_lock_locked(va_block_context->mm);
3617 
3618     // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
3619     if (va_block_context->hmm.swap_cached)
3620         return true;
3621 
3622     // migrate_vma_setup() can't migrate VM_SPECIAL so we have to force GPU
3623     // remote mapping.
3624     // TODO: Bug 3660968: add support for file-backed migrations.
3625     // TODO: Bug 3368756: add support for transparent huge page migrations.
3626     return !vma_is_anonymous(vma) ||
3627            (vma->vm_flags & VM_SPECIAL) ||
3628            vma_is_dax(vma) ||
3629            is_vm_hugetlb_page(vma);
3630 }
3631 
3632 #endif // UVM_IS_CONFIG_HMM()
3633 