1 /*******************************************************************************
2     Copyright (c) 2016-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_hmm.h"
25 
26 // Support for HMM ( https://docs.kernel.org/mm/hmm.html ):
27 
28 #ifdef NVCPU_X86_64
29 static bool uvm_disable_hmm = false;
30 MODULE_PARM_DESC(uvm_disable_hmm,
31                  "Force-disable HMM functionality in the UVM driver. "
32                  "Default: false (HMM is enabled if possible). "
33                  "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
35                  "configuration, or if ATS settings conflict with HMM.");
36 #else
37 // So far, we've only tested HMM on x86_64, so disable it by default everywhere
38 // else.
39 static bool uvm_disable_hmm = true;
40 MODULE_PARM_DESC(uvm_disable_hmm,
41                  "Force-disable HMM functionality in the UVM driver. "
42                  "Default: true (HMM is not enabled on this CPU architecture). "
43                  "However, even with uvm_disable_hmm=false, HMM will not be "
                 "enabled if it is not supported in this driver build "
45                  "configuration, or if ATS settings conflict with HMM.");
46 #endif
47 
48 module_param(uvm_disable_hmm, bool, 0444);
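// Illustrative usage (an assumption about deployment, not taken from this
// file): the parameter is normally set at module load time, e.g. with a
// modprobe.d entry such as
//
//     options nvidia-uvm uvm_disable_hmm=1
//
// assuming the module is named nvidia-uvm. The 0444 permissions above make
// the parameter read-only once the module is loaded.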
49 
50 #if UVM_IS_CONFIG_HMM()
51 
52 #include <linux/hmm.h>
53 #include <linux/rmap.h>
54 #include <linux/migrate.h>
55 #include <linux/userfaultfd_k.h>
56 #include <linux/memremap.h>
57 #include <linux/wait.h>
58 
59 #include "uvm_common.h"
60 #include "uvm_gpu.h"
61 #include "uvm_pmm_gpu.h"
62 #include "uvm_hal_types.h"
63 #include "uvm_push.h"
64 #include "uvm_hal.h"
65 #include "uvm_va_block_types.h"
66 #include "uvm_va_space_mm.h"
67 #include "uvm_va_space.h"
68 #include "uvm_va_range.h"
69 #include "uvm_range_tree.h"
70 #include "uvm_pmm_sysmem.h"
71 #include "uvm_lock.h"
72 #include "uvm_api.h"
73 #include "uvm_va_policy.h"
74 #include "uvm_tools.h"
75 
76 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
77                                uvm_page_index_t page_index,
78                                struct page *page);
79 
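// Context structures used to carry state across the multi-step GPU fault
// servicing, device-memory (devmem) fault servicing, and migration paths
// below. Each bundles the target va_block, its retry/service state, and the
// page masks tracking which pages the operation covers.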
80 typedef struct
81 {
82     uvm_processor_id_t processor_id;
83     uvm_processor_id_t new_residency;
84     uvm_va_block_t *va_block;
85     uvm_va_block_retry_t *va_block_retry;
86     uvm_service_block_context_t *service_context;
87     uvm_page_mask_t page_mask;
88     uvm_page_mask_t same_devmem_page_mask;
89 } uvm_hmm_gpu_fault_event_t;
90 
91 typedef struct
92 {
93     uvm_va_block_t *va_block;
94     uvm_va_block_retry_t *va_block_retry;
95     uvm_va_block_context_t *va_block_context;
96     uvm_va_block_region_t region;
97     uvm_processor_id_t dest_id;
98     uvm_make_resident_cause_t cause;
99     uvm_page_mask_t page_mask;
100     uvm_page_mask_t same_devmem_page_mask;
101 } uvm_hmm_migrate_event_t;
102 
103 typedef struct
104 {
105     uvm_processor_id_t processor_id;
106     uvm_va_block_t *va_block;
107     uvm_va_block_retry_t *va_block_retry;
108     uvm_service_block_context_t *service_context;
109     uvm_page_mask_t page_mask;
110     uvm_page_mask_t same_devmem_page_mask;
111 } uvm_hmm_devmem_fault_context_t;
112 
113 bool uvm_hmm_is_enabled_system_wide(void)
114 {
115     if (uvm_disable_hmm)
116         return false;
117 
118     if (g_uvm_global.ats.enabled)
119         return false;
120 
121     // Confidential Computing and HMM impose mutually exclusive constraints. In
122     // Confidential Computing the GPU can only access pages resident in vidmem,
123     // but in HMM pages may be required to be resident in sysmem: file backed
124     // VMAs, huge pages, etc.
125     if (g_uvm_global.conf_computing_enabled)
126         return false;
127 
128     return uvm_va_space_mm_enabled_system();
129 }
130 
131 bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
132 {
133     return uvm_hmm_is_enabled_system_wide() &&
134            uvm_va_space_mm_enabled(va_space) &&
135            !(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM);
136 }
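// Note that, in addition to the module parameter, an individual process can
// opt out of HMM for just its own va_space by setting
// UVM_INIT_FLAGS_DISABLE_HMM in its initialization flags (e.g. when
// initializing UVM through the user-mode UvmInitialize() API; the exact entry
// point is an assumption here). That per-va_space opt-out is what the
// initialization_flags check above tests.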
137 
138 static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
139 {
140     if (!node)
141         return NULL;
142     return container_of(node, uvm_va_block_t, hmm.node);
143 }
144 
145 // Copies the contents of the source device-private page to the
// destination CPU page. This will invalidate mappings, so it cannot be
147 // called while holding any va_block locks.
148 static void hmm_copy_devmem_page(struct page *dst_page, struct page *src_page)
149 {
150     uvm_tracker_t tracker = UVM_TRACKER_INIT();
151     uvm_gpu_phys_address_t src_addr;
152     uvm_gpu_phys_address_t dst_addr;
153     uvm_gpu_chunk_t *gpu_chunk;
154     NvU64 dma_addr;
155     uvm_push_t push;
156     NV_STATUS status = NV_OK;
157     uvm_gpu_t *gpu;
158 
    // Holding a reference on the device-private page ensures the GPU is
    // already retained. This is because when a GPU is unregistered, all
    // device-private pages are migrated back to the CPU and freed before
    // releasing the GPU. Therefore, if we were able to get a reference to
    // the page, the GPU must still be retained.
164     UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page));
165     gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page);
166     gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
167     status = uvm_mmu_chunk_map(gpu_chunk);
168     if (status != NV_OK)
169         goto out_zero;
170 
171     status = uvm_parent_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr);
172     if (status != NV_OK)
173         goto out_unmap_gpu;
174 
175     dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
176     src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address);
177     status = uvm_push_begin_acquire(gpu->channel_manager,
178                                     UVM_CHANNEL_TYPE_GPU_TO_CPU,
179                                     &tracker,
180                                     &push,
181                                     "Copy for remote process fault");
182     if (status != NV_OK)
183         goto out_unmap_cpu;
184 
185     gpu->parent->ce_hal->memcopy(&push,
186                                  uvm_gpu_address_copy(gpu, dst_addr),
187                                  uvm_gpu_address_copy(gpu, src_addr),
188                                  PAGE_SIZE);
189     uvm_push_end(&push);
190     status = uvm_tracker_add_push_safe(&tracker, &push);
191     if (status == NV_OK)
192         uvm_tracker_wait_deinit(&tracker);
193 
194 out_unmap_cpu:
195     uvm_parent_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE);
196 
197 out_unmap_gpu:
198     uvm_mmu_chunk_unmap(gpu_chunk, NULL);
199 
200 out_zero:
201     // We can't fail eviction because we need to free the device-private pages
202     // so the GPU can be unregistered. So the best we can do is warn on any
203     // failures and zero the uninitialised page. This could result in data loss
204     // in the application but failures are not expected.
205     if (WARN_ON(status != NV_OK))
206         memzero_page(dst_page, 0, PAGE_SIZE);
207 }
208 
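// Evict a single device-private page back to system memory using the kernel's
// multi-step device migration protocol: migrate_device_range() isolates the
// source page, the data is copied into a freshly allocated CPU page with
// hmm_copy_devmem_page(), and migrate_device_pages() plus
// migrate_device_finalize() install the destination page and complete (or
// roll back) the migration. Returns NV_ERR_BUSY_RETRY if the page could not
// be migrated so the caller can try again.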
209 static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
210 {
211     unsigned long src_pfn = 0;
212     unsigned long dst_pfn = 0;
213     struct page *dst_page;
214     NV_STATUS status = NV_OK;
215     int ret;
216 
217     ret = migrate_device_range(&src_pfn, pfn, 1);
218     if (ret)
219         return errno_to_nv_status(ret);
220 
221     if (src_pfn & MIGRATE_PFN_MIGRATE) {
222 
223         dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
224         if (!dst_page) {
225             status = NV_ERR_NO_MEMORY;
226             goto out;
227         }
228 
229         lock_page(dst_page);
230         hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn));
231         dst_pfn = migrate_pfn(page_to_pfn(dst_page));
232         migrate_device_pages(&src_pfn, &dst_pfn, 1);
233     }
234 
235 out:
236     migrate_device_finalize(&src_pfn, &dst_pfn, 1);
237 
238     if (!(src_pfn & MIGRATE_PFN_MIGRATE))
239         status = NV_ERR_BUSY_RETRY;
240 
241     return status;
242 }
243 
244 void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
245 {
246     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
247 
248     if (!uvm_hmm_is_enabled(va_space))
249         return;
250 
251     uvm_range_tree_init(&hmm_va_space->blocks);
252     uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF);
253 
254     return;
255 }
256 
257 void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
258 {
259     uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
260     uvm_range_tree_node_t *node, *next;
261     uvm_va_block_t *va_block;
262 
263     if (!uvm_hmm_is_enabled(va_space))
264         return;
265 
266     uvm_assert_rwsem_locked_write(&va_space->lock);
267 
268     // The blocks_lock is not needed when the va_space lock is held for write.
269     uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
270         va_block = hmm_va_block_from_node(node);
271         uvm_range_tree_remove(&hmm_va_space->blocks, node);
272         mmu_interval_notifier_remove(&va_block->hmm.notifier);
273         uvm_va_block_kill(va_block);
274     }
275 }
276 
277 static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
278                                         uvm_gpu_t *gpu,
279                                         struct mm_struct *mm)
280 {
281     uvm_va_policy_node_t *node;
282 
283     uvm_mutex_lock(&va_block->lock);
284 
285     // Reset preferred location and accessed-by of policy nodes if needed.
286     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
287         if (uvm_id_equal(node->policy.preferred_location, gpu->id))
288             node->policy.preferred_location = UVM_ID_INVALID;
289 
290         uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
291     }
292 
293     // Migrate and free any remaining resident allocations on this GPU.
294     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
295 
296     uvm_mutex_unlock(&va_block->lock);
297 }
298 
299 void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
300 {
301     uvm_range_tree_node_t *node;
302     uvm_va_block_t *va_block;
303     struct range range = gpu->pmm.devmem.pagemap.range;
304     unsigned long pfn;
305     bool retry;
306 
307     if (!uvm_hmm_is_enabled(va_space))
308         return;
309 
310     if (mm)
311         uvm_assert_mmap_lock_locked(mm);
312     uvm_assert_rwsem_locked_write(&va_space->lock);
313 
314     // There could be pages with page->zone_device_data pointing to the va_space
315     // which may be about to be freed. Migrate those back to the CPU so we don't
    // fault on them. Normally infinite retries are bad, but we don't have any
    // other option here. Device-private pages can't be pinned, so migration
    // should eventually succeed. Even if we did bail out of the loop, we'd
    // just stall in memunmap_pages() anyway.
320     do {
321         retry = false;
322 
323         for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
324             struct page *page = pfn_to_page(pfn);
325 
326             UVM_ASSERT(is_device_private_page(page));
327 
            // This check is racy because nothing stops the page from being
            // freed or even reused. That doesn't matter though - worst case
            // the migration fails, we retry, and find the va_space no longer
            // matches.
331             if (page->zone_device_data == va_space)
332                 if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
333                     retry = true;
334         }
335     } while (retry);
336 
337     uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
338         va_block = hmm_va_block_from_node(node);
339 
340         hmm_va_block_unregister_gpu(va_block, gpu, mm);
341     }
342 }
343 
344 static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
345                                              uvm_gpu_va_space_t *gpu_va_space,
346                                              uvm_va_block_context_t *va_block_context)
347 {
348     uvm_mutex_lock(&va_block->lock);
349 
350     uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
351 
352     uvm_mutex_unlock(&va_block->lock);
353 
354     // TODO: Bug 3660922: Need to handle read duplication at some point.
355     // See range_remove_gpu_va_space_managed().
356 }
357 
358 void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
359                                  uvm_gpu_va_space_t *gpu_va_space,
360                                  struct mm_struct *mm)
361 {
362     uvm_va_block_context_t *va_block_context;
363     uvm_range_tree_node_t *node, *next;
364     uvm_va_block_t *va_block;
365 
366     if (!uvm_hmm_is_enabled(va_space))
367         return;
368 
369     if (mm)
370         uvm_assert_mmap_lock_locked(mm);
371     uvm_assert_rwsem_locked_write(&va_space->lock);
372 
373     va_block_context = uvm_va_space_block_context(va_space, mm);
374 
375     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
376         va_block = hmm_va_block_from_node(node);
377 
378         hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
379     }
380 }
381 
382 static bool hmm_invalidate(uvm_va_block_t *va_block,
383                            const struct mmu_notifier_range *range,
384                            unsigned long cur_seq)
385 {
386     uvm_thread_context_t *uvm_context = uvm_thread_context();
387     struct mmu_interval_notifier *mni = &va_block->hmm.notifier;
388     struct mm_struct *mm = mni->mm;
389     uvm_va_block_context_t *va_block_context;
390     uvm_va_block_region_t region;
391     NvU64 start, end;
392     uvm_processor_id_t id;
393     NV_STATUS status = NV_OK;
394 
395     // The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release()
    // doesn't remove the interval notifiers from the mm_struct, so there will
397     // be a full range MMU_NOTIFY_UNMAP event after the release from
398     // unmap_vmas() during exit_mmap().
399     if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE)
400         return true;
401 
402     // Blockable is only set false by
403     // mmu_notifier_invalidate_range_start_nonblock() which is only called in
404     // __oom_reap_task_mm().
405     if (!mmu_notifier_range_blockable(range))
406         return false;
407 
408     // We only ignore invalidations in this context whilst holding the
409     // va_block lock. This prevents deadlock when try_to_migrate()
410     // calls the notifier, but holding the lock prevents other threads
411     // invalidating PTEs so we can safely assume the results of
412     // migrate_vma_setup() are correct.
413     if (uvm_context->ignore_hmm_invalidate_va_block == va_block ||
414         ((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) &&
415          range->owner == &g_uvm_global))
416         return true;
417 
418     va_block_context = uvm_va_block_context_alloc(mm);
419     if (!va_block_context)
420         return true;
421 
422     uvm_mutex_lock(&va_block->lock);
423 
424     // mmu_interval_notifier_remove() is always called before marking a
425     // va_block as dead so this va_block has to be alive.
426     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
427 
428     // Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
429     // Also note that hmm_invalidate() can be called when a new va_block is not
430     // yet inserted into the va_space->hmm.blocks table while the original
431     // va_block is being split. The original va_block may have its end address
432     // updated before the mmu interval notifier is updated so this invalidate
433     // may be for a range past the va_block end address.
434     start = range->start;
435     end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
436     if (start < va_block->start)
437         start = va_block->start;
438     if (end > va_block->end)
439         end = va_block->end;
440     if (start > end)
441         goto unlock;
442 
    // These two counters will be equal if no other thread causes an
    // invalidation whilst the va_block lock is dropped.
445     uvm_context->hmm_invalidate_seqnum++;
446     va_block->hmm.changed++;
447 
448     mmu_interval_set_seq(mni, cur_seq);
449 
450     region = uvm_va_block_region_from_start_end(va_block, start, end);
451 
452     va_block_context->hmm.vma = NULL;
453 
454     // We only need to unmap GPUs since Linux handles the CPUs.
455     for_each_gpu_id_in_mask(id, &va_block->mapped) {
456         status = uvm_va_block_unmap(va_block,
457                                     va_block_context,
458                                     id,
459                                     region,
460                                     uvm_va_block_map_mask_get(va_block, id),
461                                     &va_block->tracker);
462         // Note that the va_block lock can be dropped, relocked, and
463         // NV_ERR_MORE_PROCESSING_REQUIRED returned.
464         if (status != NV_OK)
465             break;
466     }
467 
468     if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR)
469         uvm_va_block_munmap_region(va_block, region);
470 
471     if (status == NV_OK)
472         status = uvm_tracker_wait(&va_block->tracker);
473 
474     // Remove stale HMM struct page pointers to system memory.
475     uvm_va_block_remove_cpu_chunks(va_block, region);
476 
477 unlock:
478     uvm_mutex_unlock(&va_block->lock);
479 
480     uvm_va_block_context_free(va_block_context);
481 
482     UVM_ASSERT(status == NV_OK);
483     return true;
484 }
485 
486 static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni,
487                                      const struct mmu_notifier_range *range,
488                                      unsigned long cur_seq)
489 {
490     uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier);
491 
492     UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq));
493 }
494 
495 static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops =
496 {
497     .invalidate = uvm_hmm_invalidate_entry,
498 };
499 
500 NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space,
501                                 NvU64 addr,
502                                 uvm_va_block_t **va_block_ptr)
503 {
504     uvm_range_tree_node_t *node;
505 
506     if (!uvm_hmm_is_enabled(va_space))
507         return NV_ERR_INVALID_ADDRESS;
508 
509     uvm_assert_rwsem_locked(&va_space->lock);
510 
511     uvm_mutex_lock(&va_space->hmm.blocks_lock);
512     node = uvm_range_tree_find(&va_space->hmm.blocks, addr);
513     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
514 
515     if (!node)
516         return NV_ERR_OBJECT_NOT_FOUND;
517 
518     *va_block_ptr = hmm_va_block_from_node(node);
519 
520     return NV_OK;
521 }
522 
523 static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block)
524 {
525     uvm_thread_context_t *uvm_context = uvm_thread_context();
526     int ret;
527 
528     // It's only safe to ignore invalidations whilst doing a migration
529     // and holding the va_block lock.
530     uvm_assert_mutex_locked(&va_block->lock);
531     uvm_context->ignore_hmm_invalidate_va_block = va_block;
532     ret = migrate_vma_setup(args);
533 
534     // We shouldn't be generating any more invalidations now.
535     uvm_context->ignore_hmm_invalidate_va_block = NULL;
536     return ret;
537 }
538 
539 static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma,
540                                  unsigned long addr,
541                                  bool allow_unreadable_vma)
542 {
543     // UVM doesn't support userfaultfd. hmm_range_fault() doesn't support
544     // VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ
545     // but we allow those VMAs to have policy set on them.
546     // migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled
547     // by uvm_hmm_must_use_sysmem() forcing residency to the CPU.
548     return vma &&
549            addr >= vma->vm_start &&
550            !userfaultfd_armed(vma) &&
551            !(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
552            !uvm_vma_is_managed(vma) &&
553            (allow_unreadable_vma || (vma->vm_flags & VM_READ));
554 }
555 
556 static void hmm_va_block_init(uvm_va_block_t *va_block,
557                               uvm_va_space_t *va_space,
558                               NvU64 start,
559                               NvU64 end)
560 {
561     va_block->hmm.va_space = va_space;
562     va_block->hmm.node.start = start;
563     va_block->hmm.node.end = end;
564     uvm_range_tree_init(&va_block->hmm.va_policy_tree);
565     uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
566 }
567 
568 static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
569                                           NvU64 addr,
570                                           bool allow_unreadable_vma,
571                                           struct vm_area_struct **vma_out,
572                                           uvm_va_block_t **va_block_ptr)
573 {
574     struct mm_struct *mm;
575     struct vm_area_struct *va_block_vma;
576     uvm_va_block_t *va_block;
577     NvU64 start, end;
578     NV_STATUS status;
579     int ret;
580 
581     if (!uvm_hmm_is_enabled(va_space))
582         return NV_ERR_INVALID_ADDRESS;
583 
584     mm = va_space->va_space_mm.mm;
585     uvm_assert_mmap_lock_locked(mm);
586     uvm_assert_rwsem_locked(&va_space->lock);
587     UVM_ASSERT(PAGE_ALIGNED(addr));
588 
589     // Note that we have to allow PROT_NONE VMAs so that policies can be set.
590     va_block_vma = find_vma(mm, addr);
591     if (!uvm_hmm_vma_is_valid(va_block_vma, addr, allow_unreadable_vma))
592         return NV_ERR_INVALID_ADDRESS;
593 
594     // Since we only hold the va_space read lock, there can be multiple
595     // parallel va_block insertions.
596     uvm_mutex_lock(&va_space->hmm.blocks_lock);
597 
598     va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr));
599     if (va_block)
600         goto done;
601 
602     // The va_block is always created to cover the whole aligned
603     // UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or
604     // HMM va_blocks. In that case, the new HMM va_block size is adjusted so it
605     // doesn't overlap.
606     start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
607     end = start + UVM_VA_BLOCK_SIZE - 1;
608 
609     // Search for existing UVM va_ranges in the start/end interval and create
610     // a maximum interval that doesn't overlap any existing UVM va_ranges.
611     // We know that 'addr' is not within a va_range or
612     // hmm_va_block_find_create() wouldn't be called.
613     status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
614     UVM_ASSERT(status == NV_OK);
615 
616     // Search for existing HMM va_blocks in the start/end interval and create
617     // a maximum interval that doesn't overlap any existing HMM va_blocks.
618     status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
619     UVM_ASSERT(status == NV_OK);
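    // Illustrative example (assuming UVM_VA_BLOCK_SIZE is 2MB): for
    // addr == 0x7f1234501000, start/end are first set to the aligned 2MB
    // interval [0x7f1234400000, 0x7f12345fffff]. If an existing va_range ends
    // at 0x7f123447ffff and an existing HMM va_block starts at
    // 0x7f1234580000, the two find_hole calls above shrink the interval to
    // [0x7f1234480000, 0x7f123457ffff] so the new va_block overlaps neither.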
620 
621     // Create a HMM va_block with a NULL va_range pointer.
622     status = uvm_va_block_create(NULL, start, end, &va_block);
623     if (status != NV_OK)
624         goto err_unlock;
625 
626     hmm_va_block_init(va_block, va_space, start, end);
627 
628     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
629                                        mm,
630                                        start,
631                                        end - start + 1,
632                                        &uvm_hmm_notifier_ops);
633     if (ret) {
634         status = errno_to_nv_status(ret);
635         goto err_release;
636     }
637 
638     status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
639     UVM_ASSERT(status == NV_OK);
640 
641 done:
642     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
643     if (vma_out)
644         *vma_out = va_block_vma;
645     *va_block_ptr = va_block;
646     return NV_OK;
647 
648 err_release:
649     uvm_va_block_release(va_block);
650 
651 err_unlock:
652     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
653     return status;
654 }
655 
656 NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
657                                        NvU64 addr,
658                                        struct vm_area_struct **vma,
659                                        uvm_va_block_t **va_block_ptr)
660 {
661     return hmm_va_block_find_create(va_space, addr, false, vma, va_block_ptr);
662 }
663 
664 NV_STATUS uvm_hmm_find_vma(struct mm_struct *mm, struct vm_area_struct **vma_out, NvU64 addr)
665 {
666     if (!mm)
667         return NV_ERR_INVALID_ADDRESS;
668 
669     uvm_assert_mmap_lock_locked(mm);
670 
671     *vma_out = find_vma(mm, addr);
672     if (!uvm_hmm_vma_is_valid(*vma_out, addr, false))
673         return NV_ERR_INVALID_ADDRESS;
674 
675     return NV_OK;
676 }
677 
678 bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
679                                         struct vm_area_struct *vma,
680                                         uvm_va_block_region_t region)
681 {
682     uvm_assert_mutex_locked(&va_block->lock);
683 
684     if (uvm_va_block_is_hmm(va_block)) {
685         UVM_ASSERT(vma);
686         UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm);
687         uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
688         UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
689         UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
690     }
691 
692     return true;
693 }
694 
695 NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
696 {
697     if (uvm_mutex_trylock(&va_block->hmm.migrate_lock))
698         return NV_OK;
699 
700     return NV_ERR_BUSY_RETRY;
701 }
702 
703 void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
704 {
705     uvm_mutex_lock(&va_block->hmm.migrate_lock);
706 }
707 
708 void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
709 {
710     uvm_mutex_unlock(&va_block->hmm.migrate_lock);
711 }
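// Illustrative usage of the migrate lock (a sketch mirroring
// hmm_migrate_range() below, not a required calling sequence):
//
//     status = uvm_hmm_migrate_begin(va_block);   // or _begin_wait() to block
//     if (status != NV_OK)
//         return status;                          // NV_ERR_BUSY_RETRY
//     uvm_mutex_lock(&va_block->lock);
//     ... migrate pages ...
//     uvm_mutex_unlock(&va_block->lock);
//     uvm_hmm_migrate_finish(va_block);
//
// The trylock variant lets a caller back off and retry later instead of
// blocking while another thread is migrating the same va_block.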
712 
// Migrate the given range [start, end] within a va_block to dest_id.
714 static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
715                                    uvm_va_block_retry_t *va_block_retry,
716                                    uvm_va_block_context_t *va_block_context,
717                                    uvm_processor_id_t dest_id,
718                                    NvU64 start,
719                                    NvU64 end,
720                                    uvm_migrate_mode_t mode,
721                                    uvm_tracker_t *out_tracker)
722 {
723     uvm_va_block_region_t region;
724     uvm_va_policy_node_t *node;
725     const uvm_va_policy_t *policy;
726     NV_STATUS status = NV_OK;
727 
728     uvm_hmm_migrate_begin_wait(va_block);
729     uvm_mutex_lock(&va_block->lock);
730 
731     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
732         // Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
733         // va_block lock, the policy remains valid because we hold the mmap
734         // lock so munmap can't remove the policy, and the va_space lock so the
735         // policy APIs can't change the policy.
736         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
737                                            va_block_retry,
738                                            uvm_va_block_migrate_locked(va_block,
739                                                                        va_block_retry,
740                                                                        va_block_context,
741                                                                        region,
742                                                                        dest_id,
743                                                                        mode,
744                                                                        out_tracker));
745         if (status != NV_OK)
746             break;
747     }
748 
749     uvm_mutex_unlock(&va_block->lock);
750     uvm_hmm_migrate_finish(va_block);
751 
752     return status;
753 }
754 
755 NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
756 {
757     uvm_va_block_test_t *block_test;
758     uvm_va_block_t *va_block;
759     NV_STATUS status;
760 
761     if (!uvm_hmm_is_enabled(va_space))
762         return NV_ERR_INVALID_ADDRESS;
763 
764     status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
765     if (status != NV_OK)
766         return status;
767 
768     block_test = uvm_va_block_get_test(va_block);
769     if (block_test)
770         block_test->inject_split_error = true;
771 
772     return NV_OK;
773 }
774 
775 typedef struct {
776     struct mmu_interval_notifier notifier;
777     uvm_va_block_t *existing_block;
778 } hmm_split_invalidate_data_t;
779 
780 static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
781                                  const struct mmu_notifier_range *range,
782                                  unsigned long cur_seq)
783 {
784     hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);
785 
786     uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
787     hmm_invalidate(split_data->existing_block, range, cur_seq);
788 
789     return true;
790 }
791 
792 static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni,
793                                        const struct mmu_notifier_range *range,
794                                        unsigned long cur_seq)
795 {
796     UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq));
797 }
798 
799 static const struct mmu_interval_notifier_ops hmm_notifier_split_ops =
800 {
801     .invalidate = hmm_split_invalidate_entry,
802 };
803 
804 // Splits existing va_block into two pieces, with new_va_block always after
805 // va_block. va_block is updated to have new_end. new_end+1 must be page-
806 // aligned.
807 //
808 // Before: [----------- existing ------------]
809 // After:  [---- existing ----][---- new ----]
810 //                            ^new_end
811 //
812 // On error, va_block is still accessible and is left in its original
813 // functional state.
814 static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
815                                  NvU64 new_end,
816                                  uvm_va_block_t **new_block_ptr)
817 {
818     uvm_va_space_t *va_space = va_block->hmm.va_space;
819     struct mm_struct *mm = va_space->va_space_mm.mm;
820     hmm_split_invalidate_data_t split_data;
821     NvU64 delay_us;
822     uvm_va_block_t *new_va_block;
823     NV_STATUS status;
824     int ret;
825 
826     uvm_assert_rwsem_locked_write(&va_space->lock);
827 
828     UVM_ASSERT(new_end > va_block->start);
829     UVM_ASSERT(new_end < va_block->end);
830     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
831 
832     status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block);
833     if (status != NV_OK)
834         return status;
835 
836     // Initialize the newly created HMM va_block.
837     hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end);
838 
839     ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
840                                        mm,
841                                        new_va_block->start,
842                                        uvm_va_block_size(new_va_block),
843                                        &uvm_hmm_notifier_ops);
844 
845     // Since __mmu_notifier_register() was called when the va_space was
846     // initially created, we know that mm->notifier_subscriptions is valid
847     // and mmu_interval_notifier_insert() can't return ENOMEM.
848     // The only error return is for start + length overflowing but we already
849     // registered the same address range before so there should be no error.
850     UVM_ASSERT(!ret);
851 
852     uvm_mutex_lock(&va_block->lock);
853 
854     status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
855     if (status != NV_OK)
856         goto err;
857 
858     uvm_mutex_unlock(&va_block->lock);
859 
860     // The MMU interval notifier has to be removed in order to resize it.
861     // That means there would be a window of time when invalidation callbacks
862     // could be missed. To handle this case, we register a temporary notifier
863     // to cover the address range while resizing the old notifier (it is
864     // OK to have multiple notifiers for the same range, we may simply try to
865     // invalidate twice).
866     split_data.existing_block = va_block;
867     ret = mmu_interval_notifier_insert(&split_data.notifier,
868                                        mm,
869                                        va_block->start,
870                                        new_end - va_block->start + 1,
871                                        &hmm_notifier_split_ops);
872     UVM_ASSERT(!ret);
873 
874     // Delay to allow hmm_sanity test to trigger an mmu_notifier during the
875     // critical window where the split invalidate callback is active.
876     delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
877     if (delay_us)
878         udelay(delay_us);
879 
880     mmu_interval_notifier_remove(&va_block->hmm.notifier);
881 
882     // Enable notifications on the old block with the smaller size.
883     ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
884                                        mm,
885                                        va_block->start,
886                                        uvm_va_block_size(va_block),
887                                        &uvm_hmm_notifier_ops);
888     UVM_ASSERT(!ret);
889 
890     mmu_interval_notifier_remove(&split_data.notifier);
891 
892     if (new_block_ptr)
893         *new_block_ptr = new_va_block;
894 
895     return status;
896 
897 err:
898     uvm_mutex_unlock(&va_block->lock);
899     mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
900     uvm_va_block_release(new_va_block);
901     return status;
902 }
903 
904 // Check to see if the HMM va_block would overlap the range start/end and
905 // split it so it can be removed. That breaks down to the following cases:
906 // start/end could cover all of the HMM va_block ->
907 //     remove the va_block
908 // start/end could cover the left part of the HMM va_block ->
909 //     remove the left part
910 // start/end could cover the right part of the HMM va_block ->
911 //     remove the right part
912 // or start/end could "punch a hole" in the middle and leave the ends intact.
913 // In each case, only one HMM va_block is removed so return it in out_va_block.
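// Illustrative examples, where |----| is the existing HMM va_block and [====]
// is the [start, end] interval being reclaimed:
//
//     |--------------------|
//     [====================]     whole block -> the va_block is removed
//     [========]                 left part   -> split, remove the left piece
//                [=========]     right part  -> split, remove the right piece
//         [=========]            middle      -> split twice, remove the middle
//                                               piece (the "hole")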
914 static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
915                                        NvU64 start,
916                                        NvU64 end,
917                                        uvm_va_block_t **out_va_block)
918 {
919     uvm_va_block_context_t *va_block_context;
920     uvm_va_space_t *va_space;
921     struct mm_struct *mm;
922     struct vm_area_struct *vma;
923     uvm_va_block_region_t region;
924     NvU64 addr, from, to;
925     uvm_va_block_t *new;
926     NV_STATUS status;
927 
928     if (va_block->start < start) {
929         status = hmm_split_block(va_block, start - 1, &new);
930         if (status != NV_OK)
931             return status;
932 
933         // Keep the left part, the right part will be deleted.
934         va_block = new;
935     }
936 
937     if (va_block->end > end) {
938         status = hmm_split_block(va_block, end, NULL);
939         if (status != NV_OK)
940             return status;
941 
942         // Keep the right part, the left part will be deleted.
943     }
944 
945     *out_va_block = va_block;
946 
947     // Migrate any GPU data to sysmem before destroying the HMM va_block.
    // We do this because the new va_range might be for a UVM external
    // allocation which could be converting an address range that was first
    // operated on by UVM-HMM, and the external allocation should see that data.
951     va_space = va_block->hmm.va_space;
952     mm = va_space->va_space_mm.mm;
953     va_block_context = uvm_va_space_block_context(va_space, mm);
954 
955     for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
956         vma = find_vma_intersection(mm, addr, va_block->end);
957         if (!vma)
958             break;
959 
960         from = max(addr, (NvU64)vma->vm_start);
961         to = min(va_block->end, (NvU64)vma->vm_end - 1);
962         region = uvm_va_block_region_from_start_end(va_block, from, to);
963 
964         if (!uvm_hmm_vma_is_valid(vma, from, false))
965             continue;
966 
967         va_block_context->hmm.vma = vma;
968 
969         status = hmm_migrate_range(va_block,
970                                    NULL,
971                                    va_block_context,
972                                    UVM_ID_CPU,
973                                    from,
974                                    to,
975                                    UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
976                                    NULL);
977         if (status != NV_OK)
978             return status;
979     }
980 
981     return NV_OK;
982 }
983 
984 // Normally, the HMM va_block is destroyed when the va_space is destroyed
985 // (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
986 // a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
987 // page tables. However, it doesn't destroy the va_block because that would
988 // require calling mmu_interval_notifier_remove() which can't be called from
989 // the invalidate callback due to Linux locking constraints. If a process
990 // calls mmap()/munmap() for SAM and then creates a managed allocation,
991 // the same VMA range can be picked and there would be a UVM/HMM va_block
992 // conflict. Creating a managed allocation, external allocation, or other
993 // va_range types, calls this function to remove stale HMM va_blocks or split
994 // the HMM va_block so there is no overlap.
995 NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
996                                    struct mm_struct *mm,
997                                    NvU64 start,
998                                    NvU64 end)
999 {
1000     uvm_range_tree_node_t *node, *next;
1001     uvm_va_block_t *va_block;
1002     NV_STATUS status;
1003 
1004     if (!uvm_hmm_is_enabled(va_space))
1005         return NV_OK;
1006 
1007     if (mm)
1008         uvm_assert_mmap_lock_locked(mm);
1009     uvm_assert_rwsem_locked_write(&va_space->lock);
1010 
1011     // Process each HMM va_block that overlaps the interval [start, end].
1012     // Note that end is inclusive.
1013     // The blocks_lock is not needed when the va_space lock is held for write.
1014     uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
1015         va_block = hmm_va_block_from_node(node);
1016 
1017         if (mm) {
1018             status = split_block_if_needed(va_block, start, end, &va_block);
1019             if (status != NV_OK)
1020                 return status;
1021         }
1022 
        // Note that this waits for any invalidation callbacks to complete
        // so uvm_hmm_invalidate() won't see a block disappear.
1025         // The va_space write lock should prevent uvm_hmm_va_block_find_create()
1026         // from adding it back.
1027         mmu_interval_notifier_remove(&va_block->hmm.notifier);
1028         uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
1029         uvm_va_block_kill(va_block);
1030     }
1031 
1032     UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));
1033 
1034     return NV_OK;
1035 }
1036 
1037 void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
1038 {
1039     uvm_va_space_t *va_space = existing_va_block->hmm.va_space;
1040 
1041     UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
1042     uvm_assert_rwsem_locked_write(&va_space->lock);
1043 
1044     uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
1045                          &existing_va_block->hmm.node,
1046                          &new_block->hmm.node);
1047 }
1048 
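// Ensure that a policy boundary can be placed at addr within an HMM va_block:
// if the policy node spanning addr would be affected by the pending policy
// change (as reported by split_needed_cb), the node is split at addr - 1 so
// that a change starting at addr doesn't alter the part of the range below
// addr.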
1049 NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
1050                                   NvU64 addr,
1051                                   uvm_va_policy_is_split_needed_t split_needed_cb,
1052                                   void *data)
1053 {
1054     uvm_va_block_t *va_block;
1055     uvm_va_policy_node_t *node;
1056     NV_STATUS status;
1057 
1058     uvm_assert_rwsem_locked_write(&va_space->lock);
1059 
1060     // If there is no HMM va_block or the va_block doesn't span the policy
1061     // addr, there is no need to split.
1062     status = uvm_hmm_va_block_find(va_space, addr, &va_block);
1063     if (status != NV_OK || va_block->start == addr)
1064         return NV_OK;
1065 
1066     uvm_mutex_lock(&va_block->lock);
1067 
1068     node = uvm_va_policy_node_find(va_block, addr);
1069     if (!node)
1070         goto done;
1071 
1072     // If the policy range doesn't span addr, we're done.
1073     if (addr == node->node.start)
1074         goto done;
1075 
1076     if (split_needed_cb(&node->policy, data))
1077         status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL);
1078 
1079 done:
1080     uvm_mutex_unlock(&va_block->lock);
1081     return status;
1082 }
1083 
1084 static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block,
1085                                                    uvm_va_block_context_t *va_block_context,
1086                                                    uvm_processor_id_t preferred_location,
1087                                                    int preferred_cpu_nid,
1088                                                    NvU64 addr,
1089                                                    NvU64 end,
1090                                                    uvm_tracker_t *out_tracker)
1091 {
1092     uvm_processor_mask_t set_accessed_by_processors;
1093     const uvm_va_policy_t *old_policy;
1094     uvm_va_policy_node_t *node;
1095     uvm_va_block_region_t region;
1096     uvm_processor_id_t id;
1097     NV_STATUS status, tracker_status;
1098 
1099     // Note that we can't just call uvm_va_policy_set_range() for the whole
    // range [addr, end] because we need to examine the old value of
1101     // policy->preferred_location and policy->preferred_nid before setting it.
1102     // Thus we iterate over the existing policy nodes.
1103     uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) {
1104         if (uvm_va_policy_preferred_location_equal(old_policy, preferred_location, preferred_cpu_nid))
1105             continue;
1106 
1107         // If the old preferred location is a valid processor ID, remote
1108         // mappings should be established to the new preferred location if
1109         // accessed-by is set.
1110         uvm_processor_mask_zero(&set_accessed_by_processors);
1111 
1112         if (UVM_ID_IS_VALID(old_policy->preferred_location) &&
1113             uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location))
1114             uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location);
1115 
1116         if (!uvm_va_policy_set_preferred_location(va_block,
1117                                                   region,
1118                                                   preferred_location,
1119                                                   preferred_cpu_nid,
1120                                                   old_policy))
1121             return NV_ERR_NO_MEMORY;
1122 
1123         // Establish new remote mappings if the old preferred location had
1124         // accessed-by set.
1125         for_each_id_in_mask(id, &set_accessed_by_processors) {
1126             status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker);
1127             if (status != NV_OK)
1128                 return status;
1129         }
1130 
1131         // Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock
1132         // the va_block lock, the policy remains valid because we hold the mmap
1133         // lock so munmap can't remove the policy, and the va_space lock so the
1134         // policy APIs can't change the policy.
1135         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1136                                            NULL,
1137                                            uvm_va_block_set_preferred_location_locked(va_block,
1138                                                                                       va_block_context,
1139                                                                                       region));
1140 
1141         tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
1142         if (status == NV_OK)
1143             status = tracker_status;
1144 
1145         if (status != NV_OK)
1146             return status;
1147     }
1148 
1149     return NV_OK;
1150 }
1151 
1152 NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
1153                                          uvm_processor_id_t preferred_location,
1154                                          int preferred_cpu_nid,
1155                                          NvU64 base,
1156                                          NvU64 last_address,
1157                                          uvm_tracker_t *out_tracker)
1158 {
1159     uvm_va_block_context_t *va_block_context;
1160     uvm_va_block_t *va_block;
1161     NvU64 addr;
1162     NV_STATUS status = NV_OK;
1163 
1164     if (!uvm_hmm_is_enabled(va_space))
1165         return NV_ERR_INVALID_ADDRESS;
1166 
1167     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1168     uvm_assert_rwsem_locked_write(&va_space->lock);
1169     UVM_ASSERT(PAGE_ALIGNED(base));
1170     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1171     UVM_ASSERT(base < last_address);
1172 
1173     // Update HMM preferred location policy.
1174 
1175     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1176 
1177     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1178         NvU64 end;
1179 
1180         status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
1181         if (status != NV_OK)
1182             break;
1183 
1184         end = min(last_address, va_block->end);
1185 
1186         uvm_mutex_lock(&va_block->lock);
1187 
1188         status = hmm_set_preferred_location_locked(va_block,
1189                                                    va_block_context,
1190                                                    preferred_location,
1191                                                    preferred_cpu_nid,
1192                                                    addr,
1193                                                    end,
1194                                                    out_tracker);
1195 
1196         uvm_mutex_unlock(&va_block->lock);
1197 
1198         if (status != NV_OK)
1199             break;
1200     }
1201 
1202     return status;
1203 }
1204 
1205 static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block,
1206                                                       uvm_va_block_context_t *va_block_context,
1207                                                       uvm_processor_id_t processor_id,
1208                                                       NvU64 start,
1209                                                       NvU64 end,
1210                                                       uvm_tracker_t *out_tracker)
1211 {
1212     uvm_va_space_t *va_space = va_block->hmm.va_space;
1213     uvm_va_policy_node_t *node;
1214     uvm_va_block_region_t region;
1215     NV_STATUS status = NV_OK;
1216 
1217     uvm_for_each_va_policy_node_in(node, va_block, start, end) {
1218         // Read duplication takes precedence over SetAccessedBy.
1219         // Do not add mappings if read duplication is enabled.
1220         if (uvm_va_policy_is_read_duplicate(&node->policy, va_space))
1221             continue;
1222 
1223         region = uvm_va_block_region_from_start_end(va_block,
1224                                                     max(start, node->node.start),
1225                                                     min(end, node->node.end));
1226 
1227         status = uvm_va_block_set_accessed_by_locked(va_block,
1228                                                      va_block_context,
1229                                                      processor_id,
1230                                                      region,
1231                                                      out_tracker);
1232         if (status != NV_OK)
1233             break;
1234     }
1235 
1236     return status;
1237 }
1238 
1239 NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space,
1240                                   uvm_processor_id_t processor_id,
1241                                   bool set_bit,
1242                                   NvU64 base,
1243                                   NvU64 last_address,
1244                                   uvm_tracker_t *out_tracker)
1245 {
1246     uvm_va_block_context_t *va_block_context;
1247     uvm_va_block_t *va_block;
1248     NvU64 addr;
1249     NV_STATUS status = NV_OK;
1250 
1251     if (!uvm_hmm_is_enabled(va_space))
1252         return NV_ERR_INVALID_ADDRESS;
1253 
1254     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1255     uvm_assert_rwsem_locked_write(&va_space->lock);
1256     UVM_ASSERT(PAGE_ALIGNED(base));
1257     UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
1258     UVM_ASSERT(base < last_address);
1259 
1260     // Update HMM accessed by policy.
1261 
1262     va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
1263 
1264     for (addr = base; addr < last_address; addr = va_block->end + 1) {
1265         NvU64 end;
1266 
1267         status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
1268         if (status != NV_OK)
1269             break;
1270 
1271         end = min(last_address, va_block->end);
1272 
1273         uvm_mutex_lock(&va_block->lock);
1274 
1275         status = uvm_va_policy_set_range(va_block,
1276                                          addr,
1277                                          end,
1278                                          UVM_VA_POLICY_ACCESSED_BY,
1279                                          !set_bit,
1280                                          processor_id,
1281                                          NUMA_NO_NODE,
1282                                          UVM_READ_DUPLICATION_MAX);
1283 
1284         if (status == NV_OK && set_bit) {
1285             status = hmm_set_accessed_by_start_end_locked(va_block,
1286                                                           va_block_context,
1287                                                           processor_id,
1288                                                           addr,
1289                                                           end,
1290                                                           out_tracker);
1291         }
1292 
1293         uvm_mutex_unlock(&va_block->lock);
1294 
1295         if (status != NV_OK)
1296             break;
1297     }
1298 
1299     return status;
1300 }
1301 
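// Re-establish mappings for this HMM va_block after its pages have been
// evicted from GPU memory: for each policy node, accessed-by mappings are
// recreated via hmm_set_accessed_by_start_end_locked(), and, if mapping
// remote on eviction is enabled for the va_space, the evicted pages are also
// mapped remotely on GPUs that support access counters. Since this deferred
// work has no caller to return an error to, failures are only logged.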
1302 void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
1303                                          uvm_va_block_t *va_block,
1304                                          uvm_va_block_context_t *block_context)
1305 {
1306     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
1307     uvm_va_policy_node_t *node;
1308     uvm_va_block_region_t region;
1309     uvm_processor_mask_t map_processors;
1310     uvm_processor_id_t id;
1311     NV_STATUS tracker_status;
1312     NV_STATUS status = NV_OK;
1313 
1314     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1315     uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
1316     uvm_assert_rwsem_locked(&va_space->lock);
1317 
1318     uvm_mutex_lock(&va_block->lock);
1319 
1320     uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
1321         for_each_id_in_mask(id, &node->policy.accessed_by) {
1322             status = hmm_set_accessed_by_start_end_locked(va_block,
1323                                                           block_context,
1324                                                           id,
1325                                                           node->node.start,
1326                                                           node->node.end,
1327                                                           &local_tracker);
1328             if (status != NV_OK)
1329                 break;
1330 
1331             if (!uvm_va_space_map_remote_on_eviction(va_space))
1332                 continue;
1333 
            // Exclude the processors that have already been mapped due to
            // AccessedBy.
1336             uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
1337 
1338             for_each_gpu_id_in_mask(id, &map_processors) {
1339                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
1340                 uvm_va_block_gpu_state_t *gpu_state;
1341 
1342                 if (!gpu->parent->access_counters_supported)
1343                     continue;
1344 
1345                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1346                 UVM_ASSERT(gpu_state);
1347 
1348                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
1349                 // remote mappings to read-duplicated pages. Add support for it
1350                 // or create a new function.
1351                 status = uvm_va_block_add_mappings(va_block,
1352                                                    block_context,
1353                                                    id,
1354                                                    region,
1355                                                    &gpu_state->evicted,
1356                                                    UvmEventMapRemoteCauseEviction);
1357                 tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker);
1358                 status = (status == NV_OK) ? tracker_status : status;
1359                 if (status != NV_OK) {
1360                     UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
1361                     break;
1362                 }
1363             }
1364         }
1365     }
1366 
1367     uvm_mutex_unlock(&va_block->lock);
1368 
1369     tracker_status = uvm_tracker_wait_deinit(&local_tracker);
1370     status = (status == NV_OK) ? tracker_status : status;
1371     if (status != NV_OK) {
1372         UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n",
1373                       va_block->start,
1374                       va_block->end,
1375                       nvstatusToString(status));
1376     }
1377 }
1378 
1379 const uvm_va_policy_t *uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
1380                                                struct vm_area_struct *vma,
1381                                                unsigned long addr,
1382                                                NvU64 *endp)
1383 {
1384     const uvm_va_policy_node_t *node;
1385     const uvm_va_policy_t *policy;
1386     NvU64 end = va_block->end;
1387 
1388     uvm_assert_mmap_lock_locked(vma->vm_mm);
1389     uvm_assert_mutex_locked(&va_block->lock);
1390 
1391     if (end > vma->vm_end - 1)
1392         end = vma->vm_end - 1;
1393 
1394     node = uvm_va_policy_node_find(va_block, addr);
1395     if (node) {
1396         policy = &node->policy;
1397         if (end > node->node.end)
1398             end = node->node.end;
1399     }
1400     else {
1401         policy = &uvm_va_policy_default;
1402     }
1403 
1404     *endp = end;
1405 
1406     return policy;
1407 }
1408 
1409 NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
1410                                             struct vm_area_struct **vma_out,
1411                                             uvm_page_index_t page_index,
1412                                             const uvm_va_policy_t **policy,
1413                                             uvm_page_index_t *outerp)
1414 {
1415     unsigned long addr;
1416     NvU64 end;
1417     uvm_page_index_t outer;
1418     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1419     struct mm_struct *mm = va_space->va_space_mm.mm;
1420 
1421     if (!mm)
1422         return NV_ERR_INVALID_ADDRESS;
1423 
1424     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1425     uvm_assert_mmap_lock_locked(mm);
1426     uvm_assert_mutex_locked(&va_block->lock);
1427 
1428     addr = uvm_va_block_cpu_page_address(va_block, page_index);
1429 
1430     *vma_out = vma_lookup(mm, addr);
1431     if (!*vma_out || !((*vma_out)->vm_flags & VM_READ))
1432         return NV_ERR_INVALID_ADDRESS;
1433 
1434     *policy = uvm_hmm_find_policy_end(va_block, *vma_out, addr, &end);
1435 
1436     outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
1437     if (*outerp > outer)
1438         *outerp = outer;
1439 
1440     return NV_OK;
1441 }
1442 
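// For each policy range in the block, unmap remote mappings of
// thrashing-pinned pages and destroy the block's thrashing tracking state.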
1443 static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
1444                                             uvm_va_block_context_t *block_context)
1445 {
1446     const uvm_va_policy_t *policy;
1447     uvm_va_policy_node_t *node;
1448     uvm_va_block_region_t region;
1449     NV_STATUS status = NV_OK;
1450 
1451     uvm_mutex_lock(&va_block->lock);
1452 
1453     uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) {
1454         // Unmap may split PTEs and require a retry. Needs to be called
1455         // before the pinned pages information is destroyed.
1456         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1457                                            NULL,
1458                                            uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
1459                                                                                             block_context,
1460                                                                                             region));
1461 
1462         uvm_perf_thrashing_info_destroy(va_block);
1463 
1464         if (status != NV_OK)
1465             break;
1466     }
1467 
1468     uvm_mutex_unlock(&va_block->lock);
1469 
1470     return status;
1471 }
1472 
1473 NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
1474 {
1475     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
1476     uvm_range_tree_node_t *node, *next;
1477     uvm_va_block_t *va_block;
1478     NV_STATUS status = NV_OK;
1479 
1480     if (!uvm_hmm_is_enabled(va_space))
1481         return NV_OK;
1482 
1483     uvm_assert_rwsem_locked_write(&va_space->lock);
1484 
1485     uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
1486         va_block = hmm_va_block_from_node(node);
1487 
1488         status = hmm_clear_thrashing_policy(va_block, block_context);
1489         if (status != NV_OK)
1490             break;
1491     }
1492 
1493     return status;
1494 }
1495 
1496 uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
1497                                                   struct vm_area_struct *vma,
1498                                                   const uvm_va_policy_t *policy,
1499                                                   NvU64 address)
1500 {
1501     NvU64 start, end;
1502 
1503     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1504 
1505     // We need to limit the prefetch region to the VMA.
1506     start = max(va_block->start, (NvU64)vma->vm_start);
1507     end = min(va_block->end, (NvU64)vma->vm_end - 1);
1508 
1509     // Also, we need to limit the prefetch region to the policy range.
1510     if (uvm_va_policy_is_default(policy)) {
1511         NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
1512                                                        address,
1513                                                        &start,
1514                                                        &end);
1515         // We already know the hole exists and covers the fault region.
1516         UVM_ASSERT(status == NV_OK);
1517     }
1518     else {
1519         const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);
1520 
1521         start = max(start, node->node.start);
1522         end = min(end, node->node.end);
1523     }
1524 
1525     return uvm_va_block_region_from_start_end(va_block, start, end);
1526 }
1527 
1528 uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
1529                                         struct vm_area_struct *vma,
1530                                         NvU64 addr)
1531 {
1532     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1533     uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
1534     UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);
1535 
1536     if (!(vma->vm_flags & VM_READ))
1537         return UVM_PROT_NONE;
1538     else if (!(vma->vm_flags & VM_WRITE))
1539         return UVM_PROT_READ_ONLY;
1540     else
1541         return UVM_PROT_READ_WRITE_ATOMIC;
1542 }
1543 
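// Wrap the given system memory page in an HMM CPU chunk, insert it in the
// va_block at page_index, and map the chunk on the GPUs.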
1544 static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
1545                                                 uvm_page_index_t page_index,
1546                                                 struct page *page)
1547 {
1548     uvm_cpu_chunk_t *chunk;
1549     NV_STATUS status;
1550 
1551     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1552     UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index));
1553 
1554     if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)))
1555         return NV_ERR_INVALID_ADDRESS;
1556 
1557     status = uvm_cpu_chunk_alloc_hmm(page, &chunk);
1558     if (status != NV_OK)
1559         return status;
1560 
1561     status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
1562     if (status != NV_OK) {
1563         uvm_cpu_chunk_free(chunk);
1564         return status;
1565     }
1566 
1567     status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
1568     if (status != NV_OK) {
1569         uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index);
1570         uvm_cpu_chunk_free(chunk);
1571     }
1572 
1573     return status;
1574 }
1575 
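// Remove the given CPU chunk from the va_block, unmap it from the GPUs, and
// free it. The page must no longer be CPU resident.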
1576 static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
1577                                               uvm_cpu_chunk_t *chunk,
1578                                               int chunk_nid,
1579                                               uvm_page_index_t page_index)
1580 {
1581     if (!chunk)
1582         return;
1583 
1584     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1585                !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
1586     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
1587 
1588     uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
1589     uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1590     uvm_cpu_chunk_free(chunk);
1591 }
1592 
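// Remove and free any CPU chunk backing page_index. If 'page' is NULL, chunks
// on all possible NUMA nodes are checked.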
1593 static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page)
1594 {
1595     uvm_cpu_chunk_t *chunk;
1596 
1597     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1598 
1599     if (page) {
1600         chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index);
1601         hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index);
1602     }
1603     else {
1604         int nid;
1605 
1606         for_each_possible_uvm_node(nid) {
1607             chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
1608             hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index);
1609         }
1610     }
1611 }
1612 
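// Return true if the CPU page recorded in the va_block for page_index matches
// the given page.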
1613 static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
1614                                           uvm_page_index_t page_index,
1615                                           struct page *page)
1616 {
1617     struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index);
1618 
1619     UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index)));
1620     return old_page == page;
1621 }
1622 
1623 // uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the
1624 // service_context masks to match what is being processed. Since a page
1625 // that was expected to be processed isn't migrating, we have to clear the
1626 // masks to make service_context consistent with what is actually being
1627 // handled.
1628 static void clear_service_context_masks(uvm_service_block_context_t *service_context,
1629                                         uvm_processor_id_t new_residency,
1630                                         uvm_page_index_t page_index)
1631 {
1632     uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index);
1633 
1634     uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
1635                         page_index);
1636 
1637     if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency))
1638         uvm_processor_mask_clear(&service_context->resident_processors, new_residency);
1639 
1640     if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency))
1641         uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index);
1642 
1643     if (service_context->thrashing_pin_count > 0 &&
1644         uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) {
1645         service_context->thrashing_pin_count--;
1646     }
1647 
1648     if (service_context->read_duplicate_count > 0 &&
1649         uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) {
1650         service_context->read_duplicate_count--;
1651     }
1652 }
1653 
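// Record that the CPU maps the page with read and, if is_write, write
// permission.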
1654 static void cpu_mapping_set(uvm_va_block_t *va_block,
1655                             bool is_write,
1656                             uvm_page_index_t page_index)
1657 {
1658     uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
1659     uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1660     if (is_write)
1661         uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1662     else
1663         uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1664 }
1665 
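// Record that the CPU no longer maps the page.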
1666 static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1667 {
1668     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
1669     uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
1670     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1671         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1672 }
1673 
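// Unmap the GPU chunk backing the given device private page and clear the
// va_block's pointer to it.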
1674 static void gpu_chunk_remove(uvm_va_block_t *va_block,
1675                              uvm_page_index_t page_index,
1676                              struct page *page)
1677 {
1678     uvm_va_block_gpu_state_t *gpu_state;
1679     uvm_gpu_chunk_t *gpu_chunk;
1680     uvm_gpu_id_t id;
1681 
1682     id = uvm_pmm_devmem_page_to_gpu_id(page);
1683     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1684     UVM_ASSERT(gpu_state);
1685 
1686     gpu_chunk = gpu_state->chunks[page_index];
1687     if (!gpu_chunk) {
1688         // If we didn't find a chunk it's because the page was unmapped for
1689         // mremap and no fault has established a new mapping.
1690         UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
1691         return;
1692     }
1693 
1694     // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
1695 
1696     uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
1697     gpu_state->chunks[page_index] = NULL;
1698 }
1699 
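// Record the GPU chunk backing the given device private page in the va_block's
// GPU state, allocating the state and mapping the chunk as needed.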
1700 static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
1701                                uvm_page_index_t page_index,
1702                                struct page *page)
1703 {
1704     uvm_va_block_gpu_state_t *gpu_state;
1705     uvm_gpu_chunk_t *gpu_chunk;
1706     uvm_gpu_id_t id;
1707     NV_STATUS status;
1708 
1709     id = uvm_pmm_devmem_page_to_gpu_id(page);
1710     gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1711 
1712     // It's possible that this is a fresh va_block we're trying to add an
1713     // existing gpu_chunk to. This occurs for example when a GPU faults on a
1714     // virtual address that has been remapped with mremap().
1715     if (!gpu_state) {
1716         status = uvm_va_block_gpu_state_alloc(va_block);
1717         if (status != NV_OK)
1718             return status;
1719         gpu_state = uvm_va_block_gpu_state_get(va_block, id);
1720     }
1721 
1722     UVM_ASSERT(gpu_state);
1723 
    // Note that a mremap() might be to a CPU virtual address that is no longer
1725     // aligned with a larger GPU chunk size. We would need to allocate a new
1726     // aligned GPU chunk and copy from old to new.
1727     // TODO: Bug 3368756: add support for large GPU pages.
1728     gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
1729     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
1730     UVM_ASSERT(gpu_chunk->is_referenced);
1731     UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
1732 
1733     if (gpu_state->chunks[page_index] == gpu_chunk)
1734         return NV_OK;
1735 
1736     UVM_ASSERT(!gpu_state->chunks[page_index]);
1737 
1738     // In some configurations such as SR-IOV heavy, the chunk cannot be
1739     // referenced using its physical address. Create a virtual mapping.
1740     status = uvm_mmu_chunk_map(gpu_chunk);
1741     if (status != NV_OK)
1742         return status;
1743 
1744     // TODO: Bug 3898467: map indirect peers.
1745 
1746     uvm_processor_mask_set(&va_block->resident, id);
1747     uvm_page_mask_set(&gpu_state->resident, page_index);
1748 
1749     // It is safe to modify the page index field without holding any PMM locks
1750     // because the chunk is allocated, which means that none of the other
1751     // fields in the bitmap can change.
1752     gpu_chunk->va_block = va_block;
1753     gpu_chunk->va_block_page_index = page_index;
1754 
1755     gpu_state->chunks[page_index] = gpu_chunk;
1756 
1757     return NV_OK;
1758 }
1759 
1760 // This is called just before calling migrate_vma_finalize() in order to wait
1761 // for GPU operations to complete and update the va_block state to match which
1762 // pages migrated (or not) and therefore which pages will be released by
1763 // migrate_vma_finalize().
1764 // 'migrated_pages' is the mask of pages that migrated,
1765 // 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
1766 // and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
1767 // 'region' is the page index region of all migrated, non-migrated, and
1768 // same_devmem_page_mask pages.
1769 static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
1770                                            const unsigned long *src_pfns,
1771                                            const unsigned long *dst_pfns,
1772                                            uvm_va_block_region_t region,
1773                                            const uvm_page_mask_t *migrated_pages,
1774                                            const uvm_page_mask_t *same_devmem_page_mask)
1775 {
1776     uvm_page_index_t page_index;
1777     NV_STATUS status;
1778 
    // Wait for the GPU to finish. migrate_vma_finalize() will release the
    // migrated source pages (or non-migrating destination pages), so GPU
    // operations must be finished by then.
1782     status = uvm_tracker_wait(&va_block->tracker);
1783 
1784     for_each_va_block_page_in_region(page_index, region) {
1785         struct page *page;
1786 
1787         if (uvm_page_mask_test(same_devmem_page_mask, page_index))
1788             continue;
1789 
1790         // If a page migrated, clean up the source page.
1791         // Otherwise, clean up the destination page.
1792         if (uvm_page_mask_test(migrated_pages, page_index))
1793             page = migrate_pfn_to_page(src_pfns[page_index]);
1794         else
1795             page = migrate_pfn_to_page(dst_pfns[page_index]);
1796 
1797         if (!page)
1798             continue;
1799 
1800         if (is_device_private_page(page)) {
1801             gpu_chunk_remove(va_block, page_index, page);
1802         }
1803         else {
1804             // If the source page is a system memory page,
1805             // migrate_vma_finalize() will release the reference so we should
1806             // clear our pointer to it.
1807             // TODO: Bug 3660922: Need to handle read duplication at some point.
1808             hmm_va_block_cpu_page_unpopulate(va_block, page_index, page);
1809         }
1810     }
1811 
1812     return status;
1813 }
1814 
1815 // Update va_block state to reflect that the page isn't migrating.
1816 static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
1817                                         const unsigned long *src_pfns,
1818                                         unsigned long *dst_pfns,
1819                                         uvm_page_index_t page_index)
1820 {
1821     struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]);
1822 
1823     if (!dst_page)
1824         return;
1825 
1826     // migrate_vma_finalize() will release the dst_page reference so don't keep
1827     // a pointer to it.
1828     if (is_device_private_page(dst_page)) {
1829         gpu_chunk_remove(va_block, page_index, dst_page);
1830     }
1831     else {
1832         UVM_ASSERT(page_ref_count(dst_page) == 1);
1833 
1834         hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page);
1835     }
1836 
1837     unlock_page(dst_page);
1838     put_page(dst_page);
1839     dst_pfns[page_index] = 0;
1840 }
1841 
1842 static void clean_up_non_migrating_pages(uvm_va_block_t *va_block,
1843                                          const unsigned long *src_pfns,
1844                                          unsigned long *dst_pfns,
1845                                          uvm_va_block_region_t region,
1846                                          uvm_page_mask_t *page_mask)
1847 {
1848     uvm_page_index_t page_index;
1849     NV_STATUS status;
1850 
1851     status = uvm_tracker_wait(&va_block->tracker);
1852     UVM_ASSERT(status == NV_OK);
1853 
1854     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1855         clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index);
1856     }
1857 }
1858 
1859 // CPU page fault handling.
1860 
1861 // Fill in the dst_pfns[page_index] entry given that there is an allocated
1862 // CPU page.
1863 static void lock_block_cpu_page(uvm_va_block_t *va_block,
1864                                 uvm_page_index_t page_index,
1865                                 struct page *src_page,
1866                                 unsigned long *dst_pfns,
1867                                 uvm_page_mask_t *same_devmem_page_mask)
1868 {
1869     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
1870     uvm_va_block_region_t chunk_region;
1871     struct page *dst_page;
1872 
1873     UVM_ASSERT(chunk);
1874     UVM_ASSERT(chunk->page);
1875 
1876     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
1877 
1878     dst_page = chunk->page + (page_index - chunk_region.first);
1879 
1880     UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)));
1881     UVM_ASSERT(!is_device_private_page(dst_page));
1882 
1883     // The source page is usually a device private page but it could be a GPU
1884     // remote mapped system memory page. It could also be a driver allocated
1885     // page for GPU-to-GPU staged copies (i.e., not a resident copy and owned
1886     // by the driver).
1887     if (is_device_private_page(src_page)) {
1888         // Since the page isn't mirrored, it was allocated by alloc_pages()
1889         // and UVM owns the reference. We leave the reference count unchanged
1890         // and mark the page pointer as mirrored since UVM is transferring
1891         // ownership to Linux and we don't want UVM to double free the page in
1892         // hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
1893         // does not migrate, it will be freed though.
1894         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
1895                    !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
1896         UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
1897         UVM_ASSERT(page_ref_count(dst_page) == 1);
1898         uvm_cpu_chunk_make_hmm(chunk);
1899     }
1900     else {
1901         UVM_ASSERT(same_devmem_page_mask);
1902         UVM_ASSERT(src_page == dst_page);
1903         uvm_page_mask_set(same_devmem_page_mask, page_index);
1904 
1905         // The call to migrate_vma_setup() will have inserted a migration PTE
1906         // so the CPU has no access.
1907         cpu_mapping_clear(va_block, page_index);
1908         return;
1909     }
1910 
1911     lock_page(dst_page);
1912     dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
1913 }
1914 
1915 static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block,
1916                                           uvm_gpu_t *gpu,
1917                                           uvm_gpu_chunk_t *gpu_chunk)
1918 {
1919     // Tell PMM to expect a callback from Linux to free the page since the
1920     // device private struct page reference count will determine when the
1921     // GPU chunk is free.
1922     UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
1923     list_del_init(&gpu_chunk->list);
1924     uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block);
1925 }
1926 
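// Fill in dst_pfns[page_index] with the device private PFN of the destination
// GPU chunk, or record the page in same_devmem_page_mask if the source is
// already that chunk.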
1927 static void fill_dst_pfn(uvm_va_block_t *va_block,
1928                          uvm_gpu_t *gpu,
1929                          const unsigned long *src_pfns,
1930                          unsigned long *dst_pfns,
1931                          uvm_page_index_t page_index,
1932                          uvm_page_mask_t *same_devmem_page_mask)
1933 {
1934     unsigned long src_pfn = src_pfns[page_index];
1935     uvm_gpu_chunk_t *gpu_chunk;
1936     unsigned long pfn;
1937     struct page *dpage;
1938 
1939     gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index));
1940     UVM_ASSERT(gpu_chunk);
1941     UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT);
1942     pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
1943 
1944     // If the same GPU page is both source and destination, migrate_vma_pages()
1945     // will see the wrong "expected" reference count and not migrate it, so we
1946     // mark it as not migrating but we keep track of this so we don't confuse
1947     // it with a page that migrate_vma_pages() actually does not migrate.
1948     if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) {
1949         uvm_page_mask_set(same_devmem_page_mask, page_index);
1950         return;
1951     }
1952 
1953     dpage = pfn_to_page(pfn);
1954     UVM_ASSERT(is_device_private_page(dpage));
1955     UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global);
1956 
1957     hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
1958     UVM_ASSERT(!page_count(dpage));
1959     zone_device_page_init(dpage);
1960     dpage->zone_device_data = va_block->hmm.va_space;
1961 
1962     dst_pfns[page_index] = migrate_pfn(pfn);
1963 }
1964 
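// Fill in the destination PFNs for all pages in the region that
// migrate_vma_setup() selected for migration to dest_id.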
1965 static void fill_dst_pfns(uvm_va_block_t *va_block,
1966                           const unsigned long *src_pfns,
1967                           unsigned long *dst_pfns,
1968                           uvm_va_block_region_t region,
1969                           uvm_page_mask_t *page_mask,
1970                           uvm_page_mask_t *same_devmem_page_mask,
1971                           uvm_processor_id_t dest_id)
1972 {
1973     uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
1974     uvm_page_index_t page_index;
1975 
1976     uvm_page_mask_zero(same_devmem_page_mask);
1977 
1978     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
1979         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE))
1980             continue;
1981 
1982         fill_dst_pfn(va_block,
1983                      gpu,
1984                      src_pfns,
1985                      dst_pfns,
1986                      page_index,
1987                      same_devmem_page_mask);
1988     }
1989 }
1990 
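// Set up the destination CPU page for a single page index: reuse the already
// allocated block page if there is one, otherwise populate a new page, and
// record it in dst_pfns[].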
1991 static NV_STATUS alloc_page_on_cpu(uvm_va_block_t *va_block,
1992                                    uvm_page_index_t page_index,
1993                                    const unsigned long *src_pfns,
1994                                    unsigned long *dst_pfns,
1995                                    uvm_page_mask_t *same_devmem_page_mask,
1996                                    uvm_va_block_context_t *block_context)
1997 {
1998     NV_STATUS status;
1999     struct page *src_page;
2000     struct page *dst_page;
2001 
2002     // This is the page that will be copied to system memory.
2003     src_page = migrate_pfn_to_page(src_pfns[page_index]);
2004 
2005     if (src_page) {
2006         // mremap may have caused us to lose the gpu_chunk associated with
2007         // this va_block/page_index so make sure we have the correct chunk.
2008         if (is_device_private_page(src_page))
2009             gpu_chunk_add(va_block, page_index, src_page);
2010 
2011         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2012             lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
2013             return NV_OK;
2014         }
2015     }
2016 
2017     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2018                 !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
2019 
2020     status = uvm_va_block_populate_page_cpu(va_block, page_index, block_context);
2021     if (status != NV_OK)
2022         return status;
2023 
2024     // TODO: Bug 3368756: add support for transparent huge pages
2025     // Support for large CPU pages means the page_index may need fixing
2026     dst_page = migrate_pfn_to_page(block_context->hmm.dst_pfns[page_index]);
2027 
2028     // Note that we don't call get_page(dst_page) since alloc_page_vma()
2029     // returns with a page reference count of one and we are passing
2030     // ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded
2031     // the page as "mirrored" so that migrate_vma_finalize() and
2032     // hmm_va_block_cpu_page_unpopulate() don't double free the page.
2033     lock_page(dst_page);
2034     dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
2035 
2036     return NV_OK;
2037 }
2038 
2039 // Allocates pages on the CPU to handle migration due to a page fault
2040 static NV_STATUS fault_alloc_on_cpu(uvm_va_block_t *va_block,
2041                                     const unsigned long *src_pfns,
2042                                     unsigned long *dst_pfns,
2043                                     uvm_va_block_region_t region,
2044                                     uvm_page_mask_t *page_mask,
2045                                     uvm_page_mask_t *same_devmem_page_mask,
2046                                     uvm_processor_id_t fault_processor_id,
2047                                     uvm_service_block_context_t *service_context)
2048 {
2049     uvm_page_index_t page_index;
2050     NV_STATUS status = NV_OK;
2051 
2052     UVM_ASSERT(service_context);
2053 
2054     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2055         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
2056             // Device exclusive PTEs are not selected but we still want to
2057             // process the page so record it as such.
2058             if (!UVM_ID_IS_CPU(fault_processor_id) &&
2059                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
2060                 uvm_page_mask_set(same_devmem_page_mask, page_index);
2061                 continue;
2062             }
2063 
2064             // We have previously found a page that is CPU resident which can't
2065             // be migrated (probably a shared mapping) so make sure we establish
2066             // a remote mapping for it.
2067             if (uvm_page_mask_test(same_devmem_page_mask, page_index))
2068                 continue;
2069 
2070             goto clr_mask;
2071         }
2072 
        status = alloc_page_on_cpu(va_block,
                                   page_index,
                                   src_pfns,
                                   dst_pfns,
                                   same_devmem_page_mask,
                                   service_context->block_context);
2074         if (status != NV_OK) {
2075             // Ignore errors if the page is only for prefetching.
2076             if (service_context &&
2077                 service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
2078                 goto clr_mask;
2079             break;
2080         }
2081         continue;
2082 
2083     clr_mask:
2084         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2085         uvm_page_mask_clear(page_mask, page_index);
2086         clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
2087     }
2088 
2089     if (status != NV_OK)
2090         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2091     else if (uvm_page_mask_empty(page_mask))
2092         return NV_WARN_MORE_PROCESSING_REQUIRED;
2093 
2094     return status;
2095 }
2096 
2097 // Allocates pages on the CPU for explicit migration calls.
2098 static NV_STATUS migrate_alloc_on_cpu(uvm_va_block_t *va_block,
2099                                       const unsigned long *src_pfns,
2100                                       unsigned long *dst_pfns,
2101                                       uvm_va_block_region_t region,
2102                                       uvm_page_mask_t *page_mask,
2103                                       uvm_page_mask_t *same_devmem_page_mask,
2104                                       uvm_va_block_context_t *block_context)
2105 {
2106     uvm_page_index_t page_index;
2107     NV_STATUS status = NV_OK;
2108 
2109     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2110         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
2111             // We have previously found a page that is CPU resident which can't
2112             // be migrated (probably a shared mapping) so make sure we establish
2113             // a remote mapping for it.
2114             if (uvm_page_mask_test(same_devmem_page_mask, page_index))
2115                 continue;
2116 
2117             uvm_page_mask_clear(page_mask, page_index);
2118             continue;
2119         }
2120 
        status = alloc_page_on_cpu(va_block, page_index, src_pfns, dst_pfns, same_devmem_page_mask, block_context);
        if (status != NV_OK)
            break;
    }
2123 
2124     if (status != NV_OK)
2125         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2126     else if (uvm_page_mask_empty(page_mask))
2127         return NV_WARN_MORE_PROCESSING_REQUIRED;
2128 
2129     return status;
2130 }
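
// First half of servicing a fault that migrates data to the CPU: build the
// migration page mask, allocate or lock the destination CPU pages, and copy
// the data without yet updating residency or mappings.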
2131 static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2132 {
2133     uvm_processor_id_t processor_id;
2134     uvm_service_block_context_t *service_context;
2135     uvm_va_block_retry_t *va_block_retry;
2136     const unsigned long *src_pfns;
2137     unsigned long *dst_pfns;
2138     uvm_page_mask_t *page_mask;
2139     uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask;
2140     uvm_va_block_t *va_block;
2141     NV_STATUS status = NV_OK;
2142 
2143     processor_id = devmem_fault_context->processor_id;
2144     service_context = devmem_fault_context->service_context;
2145     va_block_retry = devmem_fault_context->va_block_retry;
2146     va_block = devmem_fault_context->va_block;
2147     src_pfns = service_context->block_context->hmm.src_pfns;
2148     dst_pfns = service_context->block_context->hmm.dst_pfns;
2149 
2150     // Build the migration page mask.
2151     // Note that thrashing pinned pages and prefetch pages are already
2152     // accounted for in service_context->per_processor_masks.
2153     page_mask = &devmem_fault_context->page_mask;
2154     uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
2155 
2156     status = fault_alloc_on_cpu(va_block,
2157                                 src_pfns,
2158                                 dst_pfns,
2159                                 service_context->region,
2160                                 page_mask,
2161                                 same_devmem_page_mask,
2162                                 processor_id,
2163                                 service_context);
2164     if (status != NV_OK)
2165         return status;
2166 
2167     // Do the copy but don't update the residency or mapping for the new
2168     // location yet.
2169     return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2170 }
2171 
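// Second half of servicing the fault: drop pages that HMM decided not to
// migrate from the masks, update residency and mappings, and sync the
// page/chunk state with what actually migrated.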
2172 static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
2173 {
2174     uvm_processor_id_t processor_id;
2175     uvm_service_block_context_t *service_context;
2176     uvm_perf_prefetch_hint_t *prefetch_hint;
2177     uvm_va_block_retry_t *va_block_retry;
2178     const unsigned long *src_pfns;
2179     unsigned long *dst_pfns;
2180     uvm_page_mask_t *page_mask;
2181     uvm_va_block_t *va_block;
2182     uvm_va_block_region_t region;
2183     uvm_page_index_t page_index;
2184     NV_STATUS status, tracker_status;
2185 
2186     processor_id = devmem_fault_context->processor_id;
2187     service_context = devmem_fault_context->service_context;
2188     prefetch_hint = &service_context->prefetch_hint;
2189     va_block = devmem_fault_context->va_block;
2190     va_block_retry = devmem_fault_context->va_block_retry;
2191     src_pfns = service_context->block_context->hmm.src_pfns;
2192     dst_pfns = service_context->block_context->hmm.dst_pfns;
2193     region = service_context->region;
2194 
2195     page_mask = &devmem_fault_context->page_mask;
2196 
2197     // There are a number of reasons why HMM will mark a page as not migrating
2198     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2199     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2200         if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE)
2201             continue;
2202 
2203         // If a page isn't migrating and only the GPU page table is being
2204         // updated, continue to process it normally.
2205         if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index))
2206             continue;
2207 
2208         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2209         uvm_page_mask_clear(page_mask, page_index);
2210         clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
2211     }
2212 
2213     if (uvm_page_mask_empty(page_mask))
2214         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2215     else
2216         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2217 
2218     tracker_status = sync_page_and_chunk_state(va_block,
2219                                                src_pfns,
2220                                                dst_pfns,
2221                                                region,
2222                                                page_mask,
2223                                                &devmem_fault_context->same_devmem_page_mask);
2224 
2225     return status == NV_OK ? tracker_status : status;
2226 }
2227 
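// Update the va_block to match the PFNs reported by hmm_range_fault():
// re-attach device private pages to their GPU chunks, populate CPU chunks for
// system memory pages, and record CPU residency and mapping protection.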
2228 static NV_STATUS populate_region(uvm_va_block_t *va_block,
2229                                  unsigned long *pfns,
2230                                  uvm_va_block_region_t region,
2231                                  uvm_page_mask_t *populated_page_mask)
2232 {
2233     uvm_page_index_t page_index;
2234     NV_STATUS status;
2235 
2236     // Make sure GPU state is allocated or else the GPU DMA mappings to
2237     // system memory won't be saved.
2238     status = uvm_va_block_gpu_state_alloc(va_block);
2239     if (status != NV_OK)
2240         return status;
2241 
2242     for_each_va_block_page_in_region(page_index, region) {
2243         struct page *page;
2244 
2245         // This case should only happen when querying CPU residency and we ask
2246         // for something not covered by a VMA. Otherwise, hmm_range_fault()
2247         // returns -EFAULT instead of setting the HMM_PFN_ERROR bit.
2248         if (pfns[page_index] & HMM_PFN_ERROR)
2249             return NV_ERR_INVALID_ADDRESS;
2250 
2251         if (pfns[page_index] & HMM_PFN_VALID) {
2252             page = hmm_pfn_to_page(pfns[page_index]);
2253         }
2254         else {
2255             // The page can't be evicted since it has to be migrated to the GPU
2256             // first which would leave a device private page entry so this has
2257             // to be a pte_none(), swapped out, or similar entry.
2258             // The page would have been allocated if populate_region() is being
2259             // called from uvm_hmm_va_block_service_locked() so this must be
2260             // for uvm_hmm_va_block_update_residency_info(). Just leave the
2261             // residency/populated information unchanged since
2262             // uvm_hmm_invalidate() should handle that if the underlying page
2263             // is invalidated.
2264             // Also note there can be an allocated page due to GPU-to-GPU
2265             // migration between non-peer or indirect peer GPUs.
2266             continue;
2267         }
2268 
2269         if (is_device_private_page(page)) {
2270             // Linux can call hmm_invalidate() and we have to clear the GPU
2271             // chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might
2272             // not release the device private struct page reference. Since
2273             // hmm_range_fault() did find a device private PTE, we can
2274             // re-establish the GPU chunk pointer.
2275             status = gpu_chunk_add(va_block, page_index, page);
2276             if (status != NV_OK)
2277                 return status;
2278             continue;
2279         }
2280 
2281         // If a CPU chunk is already allocated, check to see it matches what
2282         // hmm_range_fault() found.
2283         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2284             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2285         }
2286         else {
2287             status = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2288             if (status != NV_OK)
2289                 return status;
2290 
2291             // Record that we populated this page. hmm_block_cpu_fault_locked()
2292             // uses this to ensure pages that don't migrate get remote mapped.
2293             if (populated_page_mask)
2294                 uvm_page_mask_set(populated_page_mask, page_index);
2295         }
2296 
2297         // Since we have a stable snapshot of the CPU pages, we can
2298         // update the residency and protection information.
2299         uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
2300 
2301         cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
2302     }
2303 
2304     return NV_OK;
2305 }
2306 
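// Snapshot the block's HMM invalidation sequence number before dropping the
// block lock for hmm_range_fault(); hmm_range_fault_retry() then reports
// whether an invalidation occurred while the lock was dropped.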
2307 static void hmm_range_fault_begin(uvm_va_block_t *va_block)
2308 {
2309     uvm_thread_context_t *uvm_context = uvm_thread_context();
2310 
2311     uvm_assert_mutex_locked(&va_block->lock);
2312     uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed;
2313 }
2314 
2315 static bool hmm_range_fault_retry(uvm_va_block_t *va_block)
2316 {
2317     uvm_thread_context_t *uvm_context = uvm_thread_context();
2318 
2319     uvm_assert_mutex_locked(&va_block->lock);
2320     return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed;
2321 }
2322 
2323 // Make the region be resident on the CPU by calling hmm_range_fault() to fault
2324 // in CPU pages.
2325 static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block,
2326                                        struct vm_area_struct *vma,
2327                                        unsigned long *hmm_pfns,
2328                                        uvm_va_block_region_t region,
2329                                        NvU8 *access_type,
2330                                        uvm_page_mask_t *populated_page_mask)
2331 {
2332     uvm_page_index_t page_index;
2333     int ret;
2334     struct hmm_range range = {
2335         .notifier = &va_block->hmm.notifier,
2336         .start = uvm_va_block_region_start(va_block, region),
2337         .end = uvm_va_block_region_end(va_block, region) + 1,
2338         .hmm_pfns = hmm_pfns + region.first,
2339         .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
2340         .dev_private_owner = &g_uvm_global,
2341     };
2342 
2343     for_each_va_block_page_in_region(page_index, region) {
2344         if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) ||
2345             (vma->vm_flags & VM_WRITE))
2346             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
2347         else
2348             hmm_pfns[page_index] = HMM_PFN_REQ_FAULT;
2349     }
2350 
2351     hmm_range_fault_begin(va_block);
2352 
2353     // Mirror the VA block to the HMM address range.
2354     // Note that we request HMM to handle page faults, which means that it will
2355     // populate and map potentially not-yet-existing pages to the VMA.
2356     // Also note that mmu_interval_read_begin() calls wait_event() for any
2357     // parallel invalidation callbacks to finish so we can't hold locks that
2358     // the invalidation callback acquires.
2359     uvm_mutex_unlock(&va_block->lock);
2360 
2361     range.notifier_seq = mmu_interval_read_begin(range.notifier);
2362     ret = hmm_range_fault(&range);
2363 
2364     uvm_mutex_lock(&va_block->lock);
2365 
2366     if (ret)
2367         return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret);
2368 
2369     if (hmm_range_fault_retry(va_block))
2370         return NV_WARN_MORE_PROCESSING_REQUIRED;
2371 
2372     return populate_region(va_block,
2373                            hmm_pfns,
2374                            region,
2375                            populated_page_mask);
2376 }
2377 
2378 // Release the reference count on any pages that were made device exclusive.
2379 static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
2380                                      uvm_service_block_context_t *service_context)
2381 {
2382     uvm_va_block_region_t region = service_context->region;
2383     uvm_page_index_t page_index;
2384 
2385     for_each_va_block_page_in_region(page_index, region) {
2386         struct page *page = service_context->block_context->hmm.pages[page_index];
2387 
2388         if (!page)
2389             continue;
2390 
2391         unlock_page(page);
2392         put_page(page);
2393     }
2394 }
2395 
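// Service a GPU atomic fault: first make the region CPU resident, then convert
// the CPU PTEs to device exclusive entries with make_device_exclusive_range().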
2396 static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
2397                                                uvm_va_block_t *va_block,
2398                                                uvm_va_block_retry_t *va_block_retry,
2399                                                uvm_service_block_context_t *service_context)
2400 {
2401     uvm_va_block_region_t region = service_context->region;
2402     struct page **pages = service_context->block_context->hmm.pages;
2403     int npages;
2404     uvm_page_index_t page_index;
2405     uvm_make_resident_cause_t cause;
2406     NV_STATUS status;
2407 
2408     if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2409         !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
2410         // There is an atomic GPU fault. We need to make sure no pages are
2411         // GPU resident so that make_device_exclusive_range() doesn't call
2412         // migrate_to_ram() and cause a va_space lock recursion problem.
2413         if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS)
2414             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
2415         else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS)
2416             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
2417         else
2418             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
2419 
2420         status = uvm_hmm_va_block_migrate_locked(va_block,
2421                                                  va_block_retry,
2422                                                  service_context->block_context,
2423                                                  UVM_ID_CPU,
2424                                                  region,
2425                                                  cause);
2426         if (status != NV_OK)
2427             goto done;
2428 
2429         // make_device_exclusive_range() will try to call migrate_to_ram()
2430         // and deadlock with ourself if the data isn't CPU resident.
2431         if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
2432             !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
2433             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2434             goto done;
2435         }
2436     }
2437 
    // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED
    // mmap() files so we check for that here and report a fatal fault.
    // Otherwise, with the current Linux 6.1 make_device_exclusive_range(), the
    // page is never made exclusive and we end up in an endless loop.
2442     if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) {
2443         status = NV_ERR_NOT_SUPPORTED;
2444         goto done;
2445     }
2446 
2447     hmm_range_fault_begin(va_block);
2448 
2449     uvm_mutex_unlock(&va_block->lock);
2450 
    npages = make_device_exclusive_range(service_context->block_context->mm,
                                         uvm_va_block_cpu_page_address(va_block, region.first),
                                         uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
                                         pages + region.first,
                                         &g_uvm_global);
2456 
2457     uvm_mutex_lock(&va_block->lock);
2458 
2459     if (npages < 0) {
2460         status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages);
2461         goto done;
2462     }
2463 
2464     while ((size_t)npages < uvm_va_block_region_num_pages(region))
2465         pages[region.first + npages++] = NULL;
2466 
2467     if (hmm_range_fault_retry(va_block)) {
2468         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2469         goto release;
2470     }
2471 
2472     status = NV_OK;
2473 
2474     for_each_va_block_page_in_region(page_index, region) {
2475         struct page *page = pages[page_index];
2476 
2477         if (!page) {
2478             // Record that one of the pages isn't exclusive but keep converting
2479             // the others.
2480             status = NV_WARN_MORE_PROCESSING_REQUIRED;
2481             continue;
2482         }
2483 
2484         // If a CPU chunk is already allocated, check to see it matches what
2485         // make_device_exclusive_range() found.
2486         if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2487             UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
2488             UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2489             UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
2490         }
2491         else {
2492             NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
2493 
2494             if (s == NV_OK)
2495                 uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
2496         }
2497 
2498         cpu_mapping_clear(va_block, page_index);
2499     }
2500 
2501     if (status != NV_OK)
2502         goto release;
2503 
2504     status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
2505     if (status != NV_OK)
2506         goto release;
2507 
2508     status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2509 
2510 release:
2511     hmm_release_atomic_pages(va_block, service_context);
2512 
2513 done:
2514     return status;
2515 }
2516 
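// Return true if any page in the region was accessed with a strong atomic
// access type.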
2517 static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region)
2518 {
2519     uvm_page_index_t page_index;
2520 
2521     for_each_va_block_page_in_region(page_index, region) {
2522         if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG)
2523             return true;
2524     }
2525 
2526     return false;
2527 }
2528 
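// Return true if any GPU has resident pages in the region.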
2529 static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region)
2530 {
2531     uvm_processor_id_t gpu_id;
2532 
2533     for_each_gpu_id_in_mask(gpu_id, &va_block->resident) {
2534         uvm_va_block_gpu_state_t *gpu_state;
2535 
2536         gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2537         if (!uvm_page_mask_region_empty(&gpu_state->resident, region))
2538             return true;
2539     }
2540 
2541     return false;
2542 }
2543 
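// Service a fault whose destination is the CPU: either a CPU fault on a device
// private page, or a GPU fault that requires the data to be migrated to or
// remote mapped from system memory.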
2544 static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
2545                                             uvm_va_block_t *va_block,
2546                                             uvm_va_block_retry_t *va_block_retry,
2547                                             uvm_service_block_context_t *service_context)
2548 {
2549     uvm_va_block_region_t region = service_context->region;
2550     struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
2551     NV_STATUS status;
2552     int ret;
2553     uvm_hmm_devmem_fault_context_t fault_context = {
2554         .processor_id = processor_id,
2555         .va_block = va_block,
2556         .va_block_retry = va_block_retry,
2557         .service_context = service_context,
2558     };
2559 
2560     // Normally the source page will be a device private page that is being
2561     // migrated to system memory. However, when it is a GPU fault, the source
2562     // page can be a system memory page that the GPU needs to remote map
2563     // instead. However migrate_vma_setup() won't select these types of
    // instead. Note that migrate_vma_setup() won't select these types of
2565     //  - device exclusive PTEs
2566     //  - shared mappings
2567     //  - file backed mappings
2568     // Also, if the source and destination page are the same, the page reference
2569     // count won't be the "expected" count and migrate_vma_pages() won't migrate
    // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and
    // uvm_hmm_devmem_fault_finalize_and_map() still need to process these
    // pages even if src_pfn indicates they are not migrating.
2573     uvm_page_mask_zero(&fault_context.same_devmem_page_mask);
2574 
2575     if (!UVM_ID_IS_CPU(processor_id)) {
2576         if (is_atomic_fault(service_context->access_type, region)) {
2577             return hmm_block_atomic_fault_locked(processor_id,
2578                                                  va_block,
2579                                                  va_block_retry,
2580                                                  service_context);
2581         }
2582 
2583         status = hmm_make_resident_cpu(va_block,
2584                                        service_context->block_context->hmm.vma,
2585                                        service_context->block_context->hmm.src_pfns,
2586                                        region,
2587                                        service_context->access_type,
2588                                        &fault_context.same_devmem_page_mask);
2589         if (status != NV_OK)
2590             return status;
2591 
2592         // If no GPU has a resident copy, we can skip the migrate_vma_*().
2593         // This is necessary if uvm_hmm_must_use_sysmem() returned true.
2594         if (!is_gpu_resident(va_block, region)) {
2595             status = uvm_va_block_service_copy(processor_id,
2596                                                UVM_ID_CPU,
2597                                                va_block,
2598                                                va_block_retry,
2599                                                service_context);
2600             if (status != NV_OK)
2601                 return status;
2602 
2603             return uvm_va_block_service_finish(processor_id, va_block, service_context);
2604         }
2605     }
2606 
2607     args->vma = service_context->block_context->hmm.vma;
2608     args->src = service_context->block_context->hmm.src_pfns + region.first;
2609     args->dst = service_context->block_context->hmm.dst_pfns + region.first;
2610     args->start = uvm_va_block_region_start(va_block, region);
2611     args->end = uvm_va_block_region_end(va_block, region) + 1;
2612     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
2613     args->pgmap_owner = &g_uvm_global;
2614 
2615     if (UVM_ID_IS_CPU(processor_id)) {
2616         args->fault_page = service_context->cpu_fault.vmf->page;
2617     }
2618     else {
2619         args->flags |= MIGRATE_VMA_SELECT_SYSTEM;
2620         args->fault_page = NULL;
2621     }
2622 
2623     ret = migrate_vma_setup_locked(args, va_block);
2624     UVM_ASSERT(!ret);
2625 
2626     // The overall process here is to migrate pages from the GPU to the CPU
2627     // and possibly remote map the GPU to sysmem if accessed_by is set.
2628     // This is safe because we hold the va_block lock across the calls to
2629     // uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(),
2630     // uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize().
2631     // If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block
2632     // lock, a sequence number is used to tell if an invalidate() callback
2633     // occurred while not holding the lock. If the sequence number changes,
2634     // all the locks need to be dropped (mm, va_space, va_block) and the whole
2635     // uvm_va_block_service_locked() called again. Otherwise, there were no
2636     // conflicting invalidate callbacks and our snapshots of the CPU page
2637     // tables are accurate and can be used to DMA pages and update GPU page
2638     // tables.
2639     status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context);
2640     if (status == NV_OK) {
2641         migrate_vma_pages(args);
2642         status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context);
2643     }
2644 
2645     migrate_vma_finalize(args);
2646 
2647     if (status == NV_WARN_NOTHING_TO_DO)
2648         status = NV_OK;
2649 
2650     return status;
2651 }
2652 
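// Prepare the source pages selected by migrate_vma_setup() so they can be
// copied to the destination GPU: device private source pages are recorded as
// GPU chunks, system memory source pages are registered as CPU resident, and
// pages that can't be migrated are cleared from page_mask.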
2653 static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
2654                                          struct vm_area_struct *vma,
2655                                          const unsigned long *src_pfns,
2656                                          unsigned long *dst_pfns,
2657                                          uvm_va_block_region_t region,
2658                                          uvm_page_mask_t *page_mask,
2659                                          uvm_processor_id_t dest_id,
2660                                          uvm_service_block_context_t *service_context)
2661 {
2662     uvm_page_index_t page_index;
2663     NV_STATUS status = NV_OK;
2664 
2665     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2666         struct page *src_page;
2667 
2668         if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
            // HMM currently has some limitations on what pages can be
            // migrated. For example, file backed pages, device private pages
            // owned by a different device, and device exclusive or swapped
            // out pages cannot be migrated.
2672             goto clr_mask;
2673         }
2674 
2675         // This is the page that will be copied to the destination GPU.
2676         src_page = migrate_pfn_to_page(src_pfns[page_index]);
2677         if (src_page) {
2678             if (is_device_private_page(src_page)) {
2679                 status = gpu_chunk_add(va_block, page_index, src_page);
2680                 if (status != NV_OK)
2681                     break;
2682                 continue;
2683             }
2684 
2685             if (PageSwapCache(src_page)) {
2686                 // TODO: Bug 4050579: Remove this when swap cached pages can be
2687                 // migrated.
2688                 status = NV_WARN_MISMATCHED_TARGET;
2689                 break;
2690             }
2691 
2692             // If the page is already allocated, it is most likely a mirrored
2693             // page. Check to be sure it matches what we have recorded. The
2694             // page shouldn't be a staging page from a GPU to GPU migration
2695             // or a remote mapped atomic sysmem page because migrate_vma_setup()
2696             // found a normal page and non-mirrored pages are only known
2697             // privately to the UVM driver.
2698             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2699                 UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
2700                 UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
2701                 UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
2702             }
2703             else {
2704                 status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
2705                 if (status != NV_OK)
2706                     goto clr_mask;
2707 
2708                 // Since there is a CPU resident page, there shouldn't be one
2709                 // anywhere else. TODO: Bug 3660922: Need to handle read
2710                 // duplication at some point.
2711                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2712 
2713                 // migrate_vma_setup() was able to isolate and lock the page;
2714                 // therefore, it is CPU resident and not mapped.
2715                 uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index);
2716             }
2717 
2718             // The call to migrate_vma_setup() will have inserted a migration
2719             // PTE so the CPU has no access.
2720             cpu_mapping_clear(va_block, page_index);
2721         }
2722         else {
            // It is OK to migrate an empty anonymous page; a zero page will
            // be allocated on the GPU. Just be sure to free any pages
            // used for GPU to GPU copies. It can't be an evicted page because
            // migrate_vma_setup() would have found a source page.
2727             if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
2728                 UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
2729 
2730                 hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
2731             }
2732         }
2733 
2734         continue;
2735 
2736     clr_mask:
2737         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2738         uvm_page_mask_clear(page_mask, page_index);
2739         if (service_context)
2740             clear_service_context_masks(service_context, dest_id, page_index);
2741     }
2742 
2743     if (uvm_page_mask_empty(page_mask))
2744         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2745 
2746     if (status != NV_OK)
2747         clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
2748 
2749     return status;
2750 }
2751 
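// Allocate pages on the faulting GPU and copy the migrating source pages to
// them. Residency and mappings are not updated here; that is done later by
// uvm_hmm_gpu_fault_finalize_and_map().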
2752 static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
2753                                                   uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2754 {
2755     uvm_processor_id_t processor_id;
2756     uvm_processor_id_t new_residency;
2757     uvm_va_block_t *va_block;
2758     uvm_va_block_retry_t *va_block_retry;
2759     uvm_service_block_context_t *service_context;
2760     uvm_perf_prefetch_hint_t *prefetch_hint;
2761     const unsigned long *src_pfns;
2762     unsigned long *dst_pfns;
2763     uvm_va_block_region_t region;
2764     uvm_page_mask_t *page_mask;
2765     NV_STATUS status;
2766 
2767     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2768     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2769     va_block = uvm_hmm_gpu_fault_event->va_block;
2770     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2771     service_context = uvm_hmm_gpu_fault_event->service_context;
2772     region = service_context->region;
2773     prefetch_hint = &service_context->prefetch_hint;
2774     src_pfns = service_context->block_context->hmm.src_pfns;
2775     dst_pfns = service_context->block_context->hmm.dst_pfns;
2776 
2777     // Build the migration mask.
2778     // Note that thrashing pinned pages are already accounted for in
2779     // service_context->resident_processors.
2780     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2781     uvm_page_mask_copy(page_mask,
2782                        &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
2783 
2784     status = dmamap_src_sysmem_pages(va_block,
2785                                      vma,
2786                                      src_pfns,
2787                                      dst_pfns,
2788                                      region,
2789                                      page_mask,
2790                                      new_residency,
2791                                      service_context);
2792     if (status != NV_OK)
2793         return status;
2794 
2795     // Do the alloc and copy but don't update the residency or mapping for the
2796     // new location yet.
2797     status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context);
2798     if (status != NV_OK)
2799         return status;
2800 
2801     // Record the destination PFNs of device private struct pages now that
2802     // uvm_va_block_service_copy() has populated the GPU destination pages.
2803     fill_dst_pfns(va_block,
2804                   src_pfns,
2805                   dst_pfns,
2806                   region,
2807                   page_mask,
2808                   &uvm_hmm_gpu_fault_event->same_devmem_page_mask,
2809                   new_residency);
2810 
2811     return status;
2812 }
2813 
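// Complete GPU fault servicing after migrate_vma_pages(): drop the pages HMM
// did not actually migrate from the masks, finish the residency and mapping
// updates, and sync page and chunk state.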
2814 static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
2815 {
2816     uvm_processor_id_t processor_id;
2817     uvm_processor_id_t new_residency;
2818     uvm_va_block_t *va_block;
2819     uvm_va_block_retry_t *va_block_retry;
2820     uvm_service_block_context_t *service_context;
2821     const unsigned long *src_pfns;
2822     unsigned long *dst_pfns;
2823     uvm_va_block_region_t region;
2824     uvm_page_index_t page_index;
2825     uvm_page_mask_t *page_mask;
2826     NV_STATUS status, tracker_status;
2827 
2828     processor_id = uvm_hmm_gpu_fault_event->processor_id;
2829     new_residency = uvm_hmm_gpu_fault_event->new_residency;
2830     va_block = uvm_hmm_gpu_fault_event->va_block;
2831     va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
2832     service_context = uvm_hmm_gpu_fault_event->service_context;
2833     src_pfns = service_context->block_context->hmm.src_pfns;
2834     dst_pfns = service_context->block_context->hmm.dst_pfns;
2835     region = service_context->region;
2836     page_mask = &uvm_hmm_gpu_fault_event->page_mask;
2837 
2838     // There are a number of reasons why HMM will mark a page as not migrating
2839     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
2840     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2841         unsigned long src_pfn = src_pfns[page_index];
2842 
2843         if (src_pfn & MIGRATE_PFN_MIGRATE)
2844             continue;
2845 
2846         // If a device private page isn't migrating and only the GPU page table
2847         // is being updated, continue to process it normally.
2848         if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index))
2849             continue;
2850 
2851         // TODO: Bug 3900774: clean up murky mess of mask clearing.
2852         uvm_page_mask_clear(page_mask, page_index);
2853         clear_service_context_masks(service_context, new_residency, page_index);
2854     }
2855 
2856     if (uvm_page_mask_empty(page_mask))
2857         status = NV_WARN_MORE_PROCESSING_REQUIRED;
2858     else
2859         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
2860 
2861     tracker_status = sync_page_and_chunk_state(va_block,
2862                                                src_pfns,
2863                                                dst_pfns,
2864                                                region,
2865                                                page_mask,
2866                                                &uvm_hmm_gpu_fault_event->same_devmem_page_mask);
2867 
2868     return status == NV_OK ? tracker_status : status;
2869 }
2870 
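// Service faults on an HMM va_block. If the new residency is the CPU, the
// fault is handled by hmm_block_cpu_fault_locked(); otherwise the pages are
// migrated to the faulting GPU with migrate_vma_setup()/migrate_vma_pages().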
2871 NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
2872                                           uvm_processor_id_t new_residency,
2873                                           uvm_va_block_t *va_block,
2874                                           uvm_va_block_retry_t *va_block_retry,
2875                                           uvm_service_block_context_t *service_context)
2876 {
2877     struct mm_struct *mm = service_context->block_context->mm;
2878     struct vm_area_struct *vma = service_context->block_context->hmm.vma;
2879     uvm_va_block_region_t region = service_context->region;
2880     uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
2881     struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
2882     int ret;
2883     NV_STATUS status = NV_ERR_INVALID_ADDRESS;
2884 
2885     if (!mm)
2886         return status;
2887 
2888     uvm_assert_mmap_lock_locked(mm);
2889     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
2890     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
2891     uvm_assert_mutex_locked(&va_block->lock);
2892     UVM_ASSERT(vma);
2893 
2894     // If the desired destination is the CPU, try to fault in CPU pages.
2895     if (UVM_ID_IS_CPU(new_residency))
2896         return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context);
2897 
2898     uvm_hmm_gpu_fault_event.processor_id = processor_id;
2899     uvm_hmm_gpu_fault_event.new_residency = new_residency;
2900     uvm_hmm_gpu_fault_event.va_block = va_block;
2901     uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry;
2902     uvm_hmm_gpu_fault_event.service_context = service_context;
2903 
2904     args->vma = vma;
2905     args->src = service_context->block_context->hmm.src_pfns + region.first;
2906     args->dst = service_context->block_context->hmm.dst_pfns + region.first;
2907     args->start = uvm_va_block_region_start(va_block, region);
2908     args->end = uvm_va_block_region_end(va_block, region) + 1;
2909     args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
2910     args->pgmap_owner = &g_uvm_global;
2911     args->fault_page = NULL;
2912 
2913     ret = migrate_vma_setup_locked(args, va_block);
2914     UVM_ASSERT(!ret);
2915 
2916     // The overall process here is to migrate pages from the CPU or GPUs to the
2917     // faulting GPU.
2918     // This is safe because we hold the va_block lock across the calls to
2919     // uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(),
2920     // uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize().
2921     // If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block
2922     // lock, a sequence number is used to tell if an invalidate() callback
2923     // occurred while not holding the lock. If the sequence number changes,
2924     // all the locks need to be dropped (mm, va_space, va_block) and the whole
2925     // uvm_va_block_service_locked() called again. Otherwise, there were no
2926     // conflicting invalidate callbacks and our snapshots of the CPU page
2927     // tables are accurate and can be used to DMA pages and update GPU page
2928     // tables. TODO: Bug 3901904: there might be better ways of handling no
2929     // page being migrated.
2930     status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event);
2931     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2932         migrate_vma_finalize(args);
2933 
        // migrate_vma_setup() might not have been able to lock/isolate any
        // pages because they are swapped out or are device exclusive.
        // We do know that none of the pages in the region are zero pages
        // since migrate_vma_setup() would have reported that information.
        // Try to make the pages resident in system memory and retry the
        // migration.
2939         status = hmm_make_resident_cpu(va_block,
2940                                        service_context->block_context->hmm.vma,
2941                                        service_context->block_context->hmm.src_pfns,
2942                                        region,
2943                                        service_context->access_type,
2944                                        NULL);
2945         return NV_WARN_MORE_PROCESSING_REQUIRED;
2946     }
2947 
2948     if (status == NV_OK) {
2949         migrate_vma_pages(args);
2950         status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event);
2951     }
2952 
2953     migrate_vma_finalize(args);
2954 
2955     if (status == NV_WARN_NOTHING_TO_DO)
2956         status = NV_OK;
2957 
2958     return status;
2959 }
2960 
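// First phase of an explicit HMM migration: allocate pages on the destination
// processor and start copying the pages selected by migrate_vma_setup().
// Residency is updated later by uvm_hmm_migrate_finalize().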
2961 static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
2962                                                 uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
2963 {
2964     uvm_va_block_t *va_block;
2965     uvm_va_block_retry_t *va_block_retry;
2966     uvm_va_block_context_t *va_block_context;
2967     const unsigned long *src_pfns;
2968     unsigned long *dst_pfns;
2969     uvm_va_block_region_t region;
2970     uvm_processor_id_t dest_id;
2971     uvm_page_mask_t *page_mask;
2972     NV_STATUS status;
2973 
2974     va_block = uvm_hmm_migrate_event->va_block;
2975     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
2976     va_block_context = uvm_hmm_migrate_event->va_block_context;
2977     src_pfns = va_block_context->hmm.src_pfns;
2978     dst_pfns = va_block_context->hmm.dst_pfns;
2979     region = uvm_hmm_migrate_event->region;
2980     dest_id = uvm_hmm_migrate_event->dest_id;
2981     page_mask = &uvm_hmm_migrate_event->page_mask;
2982     uvm_page_mask_init_from_region(page_mask, region, NULL);
2983     uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask);
2984 
2985     uvm_assert_mutex_locked(&va_block->lock);
2986 
2987     if (UVM_ID_IS_CPU(dest_id)) {
2988         status = migrate_alloc_on_cpu(va_block,
2989                                       src_pfns,
2990                                       dst_pfns,
2991                                       region,
2992                                       page_mask,
2993                                       &uvm_hmm_migrate_event->same_devmem_page_mask,
2994                                       va_block_context);
2995     }
2996     else {
2997         status = dmamap_src_sysmem_pages(va_block,
2998                                          vma,
2999                                          src_pfns,
3000                                          dst_pfns,
3001                                          region,
3002                                          page_mask,
3003                                          dest_id,
3004                                          NULL);
3005     }
3006     if (status != NV_OK)
3007         return status;
3008 
3009     status = uvm_va_block_make_resident_copy(va_block,
3010                                              va_block_retry,
3011                                              va_block_context,
3012                                              dest_id,
3013                                              region,
3014                                              page_mask,
3015                                              NULL,
3016                                              uvm_hmm_migrate_event->cause);
3017     if (status != NV_OK)
3018         return status;
3019 
3020     if (!UVM_ID_IS_CPU(dest_id)) {
3021         // Record the destination PFNs of device private struct pages now that
3022         // uvm_va_block_make_resident_copy() has populated the GPU destination
3023         // pages.
3024         fill_dst_pfns(va_block,
3025                       src_pfns,
3026                       dst_pfns,
3027                       region,
3028                       page_mask,
3029                       &uvm_hmm_migrate_event->same_devmem_page_mask,
3030                       dest_id);
3031     }
3032 
3033     return status;
3034 }
3035 
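// Second phase of an explicit HMM migration: clear the pages HMM did not
// migrate from the mask, mark the remaining pages resident on the
// destination, and sync page and chunk state.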
3036 static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
3037 {
3038     uvm_va_block_t *va_block;
3039     uvm_va_block_retry_t *va_block_retry;
3040     uvm_va_block_context_t *va_block_context;
3041     uvm_va_block_region_t region;
3042     uvm_processor_id_t dest_id;
3043     uvm_page_index_t page_index;
3044     uvm_page_mask_t *page_mask;
3045     const unsigned long *src_pfns;
3046     unsigned long *dst_pfns;
3047 
3048     va_block = uvm_hmm_migrate_event->va_block;
3049     va_block_retry = uvm_hmm_migrate_event->va_block_retry;
3050     va_block_context = uvm_hmm_migrate_event->va_block_context;
3051     region = uvm_hmm_migrate_event->region;
3052     dest_id = uvm_hmm_migrate_event->dest_id;
3053     page_mask = &uvm_hmm_migrate_event->page_mask;
3054     src_pfns = va_block_context->hmm.src_pfns;
3055     dst_pfns = va_block_context->hmm.dst_pfns;
3056 
3057     uvm_assert_mutex_locked(&va_block->lock);
3058 
3059     // There are a number of reasons why HMM will mark a page as not migrating
3060     // even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
3061     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
3062         unsigned long src_pfn = src_pfns[page_index];
3063 
3064         if (src_pfn & MIGRATE_PFN_MIGRATE)
3065             continue;
3066 
3067         // If a device private page isn't migrating and only the GPU page table
3068         // is being updated, continue to process it normally.
3069         if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index))
3070             continue;
3071 
3072         uvm_page_mask_clear(page_mask, page_index);
3073     }
3074 
3075     uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask);
3076 
3077     return sync_page_and_chunk_state(va_block,
3078                                      src_pfns,
3079                                      dst_pfns,
3080                                      region,
3081                                      page_mask,
3082                                      &uvm_hmm_migrate_event->same_devmem_page_mask);
3083 }
3084 
// Note that the migrate_vma_*() routines don't handle asynchronous migrations,
// so the migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP has no effect.
3087 // TODO: Bug 3900785: investigate ways to implement async migration.
3088 NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
3089                                           uvm_va_block_retry_t *va_block_retry,
3090                                           uvm_va_block_context_t *va_block_context,
3091                                           uvm_processor_id_t dest_id,
3092                                           uvm_va_block_region_t region,
3093                                           uvm_make_resident_cause_t cause)
3094 {
3095     uvm_hmm_migrate_event_t uvm_hmm_migrate_event;
3096     struct vm_area_struct *vma = va_block_context->hmm.vma;
3097     NvU64 start;
3098     NvU64 end;
3099     struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args;
3100     NV_STATUS status;
3101     int ret;
3102 
3103     UVM_ASSERT(vma);
3104     UVM_ASSERT(va_block_context->mm == vma->vm_mm);
3105     uvm_assert_mmap_lock_locked(va_block_context->mm);
3106     uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
3107     uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
3108     uvm_assert_mutex_locked(&va_block->lock);
3109 
3110     start = uvm_va_block_region_start(va_block, region);
3111     end = uvm_va_block_region_end(va_block, region);
3112     UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end);
3113 
3114     uvm_hmm_migrate_event.va_block = va_block;
3115     uvm_hmm_migrate_event.va_block_retry = va_block_retry;
3116     uvm_hmm_migrate_event.va_block_context = va_block_context;
3117     uvm_hmm_migrate_event.region = region;
3118     uvm_hmm_migrate_event.dest_id = dest_id;
3119     uvm_hmm_migrate_event.cause = cause;
3120 
3121     args->vma = vma;
3122     args->src = va_block_context->hmm.src_pfns + region.first;
3123     args->dst = va_block_context->hmm.dst_pfns + region.first;
3124     args->start = uvm_va_block_region_start(va_block, region);
3125     args->end = uvm_va_block_region_end(va_block, region) + 1;
3126     args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
3127                                            MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
3128     args->pgmap_owner = &g_uvm_global;
3129     args->fault_page = NULL;
3130 
    // Note that migrate_vma_setup() doesn't handle file backed or VM_SPECIAL
    // VMAs, so if UvmMigrate() tries to migrate such a region, -EINVAL will
    // be returned and we will only try to make the pages CPU resident.
3134     ret = migrate_vma_setup_locked(args, va_block);
3135     if (ret)
3136         return hmm_make_resident_cpu(va_block,
3137                                      vma,
3138                                      va_block_context->hmm.src_pfns,
3139                                      region,
3140                                      NULL,
3141                                      NULL);
3142 
3143     // The overall process here is to migrate pages from the CPU or GPUs to the
3144     // destination processor. Note that block_migrate_add_mappings() handles
3145     // updating GPU mappings after the migration.
3146     // This is safe because we hold the va_block lock across the calls to
3147     // uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
3148     // uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
3149     // block_migrate_add_mappings().
3150     // If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block
3151     // lock, a sequence number is used to tell if an invalidate() callback
3152     // occurred while not holding the lock. If the sequence number changes,
3153     // all the locks need to be dropped (mm, va_space, va_block) and the whole
3154     // uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no
3155     // conflicting invalidate callbacks and our snapshots of the CPU page
3156     // tables are accurate and can be used to DMA pages and update GPU page
3157     // tables.
3158     status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event);
3159     if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
3160         uvm_processor_id_t id;
3161         uvm_page_mask_t *page_mask;
3162 
3163         migrate_vma_finalize(args);
3164 
        // The CPU page tables might contain only device private pages, or
        // migrate_vma_setup() might not have been able to lock/isolate any
        // pages because they are swapped out or on another device.
3168         // We do know that none of the pages in the region are zero pages
3169         // since migrate_vma_setup() would have reported that information.
3170         // Collect all the pages that need to be faulted in and made CPU
3171         // resident, then do the hmm_range_fault() and retry.
3172         page_mask = &va_block_context->caller_page_mask;
3173         uvm_page_mask_init_from_region(page_mask, region, NULL);
3174 
3175         for_each_id_in_mask(id, &va_block->resident) {
3176             if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE)))
3177                 return NV_OK;
3178         }
3179 
3180         return hmm_make_resident_cpu(va_block,
3181                                      vma,
3182                                      va_block_context->hmm.src_pfns,
3183                                      region,
3184                                      NULL,
3185                                      NULL);
3186     }
3187 
3188     if (status == NV_OK) {
3189         migrate_vma_pages(args);
3190         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3191     }
3192 
3193     migrate_vma_finalize(args);
3194 
3195     if (status == NV_WARN_NOTHING_TO_DO || status == NV_WARN_MISMATCHED_TARGET)
3196         status = NV_OK;
3197 
3198     return status;
3199 }
3200 
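// Migrate [base, base + length) to dest_id, one va_block at a time, clamping
// each piece to the containing VMA.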
3201 NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
3202                                  uvm_va_block_context_t *va_block_context,
3203                                  NvU64 base,
3204                                  NvU64 length,
3205                                  uvm_processor_id_t dest_id,
3206                                  uvm_migrate_mode_t mode,
3207                                  uvm_tracker_t *out_tracker)
3208 {
3209     struct mm_struct *mm;
3210     uvm_va_block_t *va_block;
3211     uvm_va_block_retry_t va_block_retry;
3212     NvU64 addr, end, last_address;
3213     NV_STATUS status = NV_OK;
3214 
3215     if (!uvm_hmm_is_enabled(va_space))
3216         return NV_ERR_INVALID_ADDRESS;
3217 
3218     mm = va_block_context->mm;
3219     UVM_ASSERT(mm == va_space->va_space_mm.mm);
3220     uvm_assert_mmap_lock_locked(mm);
3221     uvm_assert_rwsem_locked(&va_space->lock);
3222 
3223     last_address = base + length - 1;
3224 
3225     for (addr = base; addr < last_address; addr = end + 1) {
3226         struct vm_area_struct *vma;
3227 
3228         status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block);
3229         if (status != NV_OK)
3230             return status;
3231 
3232         end = va_block->end;
3233         if (end > last_address)
3234             end = last_address;
3235 
3236         vma = va_block_context->hmm.vma;
3237         if (end > vma->vm_end - 1)
3238             end = vma->vm_end - 1;
3239 
3240         status = hmm_migrate_range(va_block,
3241                                    &va_block_retry,
3242                                    va_block_context,
3243                                    dest_id,
3244                                    addr,
3245                                    end,
3246                                    mode,
3247                                    out_tracker);
3248         if (status != NV_OK)
3249             break;
3250     }
3251 
3252     return status;
3253 }
3254 
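// Fill in the src_pfns[] entries for the GPU chunk being evicted by calling
// migrate_device_range(). The MMU notifier invalidate callback is told to
// ignore this va_block since its lock is already held here.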
3255 NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
3256                                             uvm_va_block_context_t *va_block_context,
3257                                             uvm_gpu_chunk_t *gpu_chunk,
3258                                             uvm_va_block_region_t chunk_region)
3259 {
3260     uvm_thread_context_t *uvm_context = uvm_thread_context();
3261     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3262     uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
3263     unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
3264     uvm_page_index_t page_index = chunk_region.first;
3265     int ret;
3266 
3267     uvm_assert_mutex_locked(&va_block->lock);
3268     // TODO: Bug 3368756: add support for large GPU pages.
3269     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1);
3270 
3271     uvm_context->ignore_hmm_invalidate_va_block = va_block;
3272     ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region));
3273     uvm_context->ignore_hmm_invalidate_va_block = NULL;
3274     if (ret)
3275         return errno_to_nv_status(ret);
3276 
3277     return NV_OK;
3278 }
3279 
3280 // Note that the caller must initialize va_block_context->hmm.src_pfns by
3281 // calling uvm_hmm_va_block_evict_chunk_prep() before calling this.
3282 static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3283                                            uvm_va_block_context_t *va_block_context,
3284                                            const uvm_page_mask_t *pages_to_evict,
3285                                            uvm_va_block_region_t region,
3286                                            uvm_make_resident_cause_t cause,
3287                                            bool *out_accessed_by_set)
3288 {
3289     NvU64 start = uvm_va_block_region_start(va_block, region);
3290     NvU64 end = uvm_va_block_region_end(va_block, region);
3291     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3292     unsigned long *dst_pfns = va_block_context->hmm.dst_pfns;
3293     uvm_hmm_migrate_event_t uvm_hmm_migrate_event = {
3294         .va_block = va_block,
3295         .va_block_retry = NULL,
3296         .va_block_context = va_block_context,
3297         .region = region,
3298         .dest_id = UVM_ID_CPU,
3299         .cause = cause,
3300     };
3301     uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
3302     const uvm_va_policy_t *policy;
3303     uvm_va_policy_node_t *node;
3304     uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
3305     unsigned long npages;
3306     NV_STATUS status;
3307 
3308     uvm_assert_mutex_locked(&va_block->lock);
3309 
3310     if (out_accessed_by_set)
3311         *out_accessed_by_set = false;
3312 
3313     // Note that there is no VMA available when evicting HMM pages.
3314     va_block_context->hmm.vma = NULL;
3315 
3316     uvm_page_mask_copy(page_mask, pages_to_evict);
3317 
3318     uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
3319         npages = uvm_va_block_region_num_pages(region);
3320 
3321         if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0)
3322             *out_accessed_by_set = true;
3323 
3324         // Pages resident on the GPU should not have a resident page in system
3325         // memory.
3326         // TODO: Bug 3660922: Need to handle read duplication at some point.
3327         UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));
3328 
3329         status = migrate_alloc_on_cpu(va_block,
3330                                       src_pfns,
3331                                       dst_pfns,
3332                                       region,
3333                                       page_mask,
3334                                       NULL,
3335                                       va_block_context);
3336         if (status != NV_OK)
3337             goto err;
3338 
3339         status = uvm_va_block_make_resident_copy(va_block,
3340                                                  NULL,
3341                                                  va_block_context,
3342                                                  UVM_ID_CPU,
3343                                                  region,
3344                                                  page_mask,
3345                                                  NULL,
3346                                                  cause);
3347         if (status != NV_OK)
3348             goto err;
3349 
3350         migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);
3351 
3352         uvm_hmm_migrate_event.region = region;
3353 
3354         status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
3355         if (status != NV_OK)
3356             goto err;
3357 
3358         migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3359     }
3360 
3361     return NV_OK;
3362 
3363 err:
3364     migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
3365     return status;
3366 }
3367 
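// Evict the given pages to system memory due to GPU memory pressure.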
3368 NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
3369                                         uvm_va_block_context_t *va_block_context,
3370                                         const uvm_page_mask_t *pages_to_evict,
3371                                         uvm_va_block_region_t region,
3372                                         bool *out_accessed_by_set)
3373 {
3374     return hmm_va_block_evict_chunks(va_block,
3375                                      va_block_context,
3376                                      pages_to_evict,
3377                                      region,
3378                                      UVM_MAKE_RESIDENT_CAUSE_EVICTION,
3379                                      out_accessed_by_set);
3380 }
3381 
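// Evict the given pages from the specified GPU, first filling src_pfns[] with
// the device private PFNs of that GPU's chunks.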
3382 NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
3383                                                 uvm_gpu_t *gpu,
3384                                                 uvm_va_block_context_t *va_block_context,
3385                                                 const uvm_page_mask_t *pages_to_evict,
3386                                                 uvm_va_block_region_t region)
3387 {
3388     unsigned long *src_pfns = va_block_context->hmm.src_pfns;
3389     uvm_va_block_gpu_state_t *gpu_state;
3390     uvm_page_index_t page_index;
3391     uvm_gpu_chunk_t *gpu_chunk;
3392     NV_STATUS status;
3393 
3394     uvm_assert_mutex_locked(&va_block->lock);
3395 
3396     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
3397     UVM_ASSERT(gpu_state);
3398     UVM_ASSERT(gpu_state->chunks);
3399 
3400     // Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
3401     memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));
3402 
3403     // TODO: Bug 3368756: add support for large GPU pages.
3404     for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
3405         gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
3406                                                   gpu,
3407                                                   uvm_va_block_cpu_page_address(va_block, page_index));
3408         status = uvm_hmm_va_block_evict_chunk_prep(va_block,
3409                                                    va_block_context,
3410                                                    gpu_chunk,
3411                                                    uvm_va_block_region_for_page(page_index));
3412         if (status != NV_OK)
3413             return status;
3414     }
3415 
3416     return hmm_va_block_evict_chunks(va_block,
3417                                      va_block_context,
3418                                      pages_to_evict,
3419                                      region,
3420                                      UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
3421                                      NULL);
3422 }
3423 
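// Handle a CPU fault on a UVM device private page when no va_block is
// available: migrate the single faulting page back to a newly allocated
// system memory page.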
3424 NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
3425 {
3426     NV_STATUS status = NV_OK;
3427     unsigned long src_pfn;
3428     unsigned long dst_pfn;
3429     struct migrate_vma args;
3430     struct page *src_page = vmf->page;
3431     int ret;
3432 
3433     args.vma = vmf->vma;
3434     args.src = &src_pfn;
3435     args.dst = &dst_pfn;
3436     args.start = nv_page_fault_va(vmf);
3437     args.end = args.start + PAGE_SIZE;
3438     args.pgmap_owner = &g_uvm_global;
3439     args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
3440     args.fault_page = src_page;
3441 
3442     // We don't call migrate_vma_setup_locked() here because we don't
3443     // have a va_block and don't want to ignore invalidations.
3444     ret = migrate_vma_setup(&args);
3445     UVM_ASSERT(!ret);
3446 
3447     if (src_pfn & MIGRATE_PFN_MIGRATE) {
3448         struct page *dst_page;
3449 
3450         dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
3451         if (!dst_page) {
3452             status = NV_ERR_NO_MEMORY;
3453             goto out;
3454         }
3455 
3456         lock_page(dst_page);
3457         dst_pfn = migrate_pfn(page_to_pfn(dst_page));
3458 
3459         hmm_copy_devmem_page(dst_page, src_page);
3460     }
3461 
3462     migrate_vma_pages(&args);
3463 
3464 out:
3465     migrate_vma_finalize(&args);
3466 
3467     return status;
3468 }
3469 
3470 // The routines below are all for UVM-HMM tests.
3471 
3472 NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
3473                                         struct mm_struct *mm,
3474                                         NvU64 lookup_address,
3475                                         NvU64 *startp,
3476                                         NvU64 *endp,
3477                                         UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
3478 {
3479     struct vm_area_struct *vma;
3480     NvU64 start;
3481     NvU64 end;
3482 
3483     if (!uvm_hmm_is_enabled(va_space) || !mm)
3484         return NV_ERR_INVALID_ADDRESS;
3485 
3486     uvm_assert_mmap_lock_locked(mm);
3487     uvm_assert_rwsem_locked(&va_space->lock);
3488 
    // The VMA might have changed while mmap_lock was not held, so check it.
3490     vma = find_vma(mm, lookup_address);
3491     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3492         return NV_ERR_INVALID_ADDRESS;
3493 
3494     // Since managed VA ranges don't cover more than one VMA, return only the
3495     // intersecting range of the VA block and VMA.
3496     start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
3497     end = start + UVM_VA_BLOCK_SIZE - 1;
3498     if (start < vma->vm_start)
3499         start = vma->vm_start;
3500     if (end > vma->vm_end - 1)
3501         end = vma->vm_end - 1;
3502 
3503     *startp = start;
3504     *endp   = end;
3505 
3506     if (params) {
3507         uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
3508         params->resident_physical_size[0] = PAGE_SIZE;
3509         params->resident_on_count = 1;
3510 
3511         uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
3512         params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
3513                                   UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
3514         params->page_size[0] = PAGE_SIZE;
3515         params->mapped_on_count = 1;
3516 
3517         uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
3518         params->populated_on_count = 1;
3519     }
3520 
3521     return NV_OK;
3522 }
3523 
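// Test support: snapshot (and optionally fault in) the CPU PTE for
// lookup_address with hmm_range_fault() and update the va_block CPU state to
// match.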
3524 NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
3525                                                  struct mm_struct *mm,
3526                                                  NvU64 lookup_address,
3527                                                  bool populate)
3528 {
3529     uvm_va_space_t *va_space = va_block->hmm.va_space;
3530     struct vm_area_struct *vma;
3531     struct hmm_range range;
3532     uvm_va_block_region_t region;
3533     unsigned long pfn;
3534     NvU64 end;
3535     int ret;
3536     NV_STATUS status;
3537 
3538     if (!uvm_hmm_is_enabled(va_space) || !mm)
3539         return NV_ERR_INVALID_ADDRESS;
3540 
3541     uvm_assert_mmap_lock_locked(mm);
3542     uvm_assert_rwsem_locked(&va_space->lock);
3543 
    // The VMA might have changed while mmap_lock was not held, so check it.
3545     vma = find_vma(mm, lookup_address);
3546     if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
3547         return NV_ERR_INVALID_ADDRESS;
3548 
3549     end = lookup_address + PAGE_SIZE;
3550     region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);
3551 
3552     range.notifier = &va_block->hmm.notifier;
3553     range.start = lookup_address;
3554     range.end = end;
3555     range.hmm_pfns = &pfn;
3556     range.default_flags = 0;
3557     range.pfn_flags_mask = 0;
3558     range.dev_private_owner = &g_uvm_global;
3559 
3560     if (populate) {
3561         range.default_flags = HMM_PFN_REQ_FAULT;
3562         if (vma->vm_flags & VM_WRITE)
3563             range.default_flags |= HMM_PFN_REQ_WRITE;
3564     }
3565 
3566     uvm_hmm_migrate_begin_wait(va_block);
3567 
3568     while (true) {
3569         range.notifier_seq = mmu_interval_read_begin(range.notifier);
3570         ret = hmm_range_fault(&range);
3571         if (ret == -EBUSY)
3572             continue;
3573         if (ret) {
3574             uvm_hmm_migrate_finish(va_block);
3575             return errno_to_nv_status(ret);
3576         }
3577 
3578         uvm_mutex_lock(&va_block->lock);
3579 
3580         if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
3581             break;
3582 
3583         uvm_mutex_unlock(&va_block->lock);
3584     }
3585 
3586     // Update the va_block CPU state based on the snapshot.
3587     // Note that we have to adjust the pfns address since it will be indexed
3588     // by region.first.
3589     status = populate_region(va_block, &pfn - region.first, region, NULL);
3590 
3591     uvm_mutex_unlock(&va_block->lock);
3592     uvm_hmm_migrate_finish(va_block);
3593 
    return status;
3595 }
3596 
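// Test support: record the requested split invalidate delay (in
// microseconds) in the va_space.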
3597 NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
3598 {
3599     uvm_va_space_t *va_space = uvm_va_space_get(filp);
3600 
3601     atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);
3602 
3603     return NV_OK;
3604 }
3605 
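// Test support: report the VMA, va_block, and policy information covering
// params->lookup_address.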
3606 NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
3607                                 struct mm_struct *mm,
3608                                 UVM_TEST_VA_RANGE_INFO_PARAMS *params)
3609 {
3610     uvm_range_tree_node_t *tree_node;
3611     const uvm_va_policy_node_t *node;
3612     struct vm_area_struct *vma;
3613     uvm_va_block_t *va_block;
3614 
3615     if (!mm || !uvm_hmm_is_enabled(va_space))
3616         return NV_ERR_INVALID_ADDRESS;
3617 
3618     uvm_assert_mmap_lock_locked(mm);
3619     uvm_assert_rwsem_locked(&va_space->lock);
3620 
3621     params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
3622     params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
3623     params->va_range_start = 0;
3624     params->va_range_end = ULONG_MAX;
3625     params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
3626     memset(&params->preferred_location, 0, sizeof(params->preferred_location));
3627     params->preferred_cpu_nid = NUMA_NO_NODE;
3628     params->accessed_by_count = 0;
3629     params->managed.vma_start = 0;
3630     params->managed.vma_end = 0;
3631     params->managed.is_zombie = NV_FALSE;
3632     params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
3633 
3634     vma = find_vma(mm, params->lookup_address);
3635     if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
3636         return NV_ERR_INVALID_ADDRESS;
3637 
3638     params->va_range_start = vma->vm_start;
3639     params->va_range_end   = vma->vm_end - 1;
3640     params->managed.vma_start = vma->vm_start;
3641     params->managed.vma_end   = vma->vm_end - 1;
3642 
3643     uvm_mutex_lock(&va_space->hmm.blocks_lock);
3644     tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
3645     if (!tree_node) {
3646         UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
3647                                                &params->va_range_start, &params->va_range_end) == NV_OK);
3648         uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3649         return NV_OK;
3650     }
3651 
3652     uvm_mutex_unlock(&va_space->hmm.blocks_lock);
3653     va_block = hmm_va_block_from_node(tree_node);
3654     uvm_mutex_lock(&va_block->lock);
3655 
3656     params->va_range_start = va_block->start;
3657     params->va_range_end   = va_block->end;
3658 
3659     node = uvm_va_policy_node_find(va_block, params->lookup_address);
3660     if (node) {
3661         uvm_processor_id_t processor_id;
3662 
3663         if (params->va_range_start < node->node.start)
3664             params->va_range_start = node->node.start;
3665         if (params->va_range_end > node->node.end)
3666             params->va_range_end = node->node.end;
3667 
3668         params->read_duplication = node->policy.read_duplication;
3669 
3670         if (!UVM_ID_IS_INVALID(node->policy.preferred_location)) {
3671             uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);
3672             params->preferred_cpu_nid = node->policy.preferred_nid;
3673         }
3674 
3675         for_each_id_in_mask(processor_id, &node->policy.accessed_by)
3676             uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
3677     }
3678     else {
3679         uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
3680                                     &params->va_range_start, &params->va_range_end);
3681     }
3682 
3683     uvm_mutex_unlock(&va_block->lock);
3684 
3685     return NV_OK;
3686 }
3687 
3688 // TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
3689 // for VMAs other than anonymous private memory.
3690 bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
3691                              struct vm_area_struct *vma)
3692 {
3693     uvm_assert_mutex_locked(&va_block->lock);
3694 
3695     if (!uvm_va_block_is_hmm(va_block))
3696         return false;
3697 
3698     UVM_ASSERT(vma);
3699     UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm);
3700     uvm_assert_mmap_lock_locked(vma->vm_mm);
3701 
    // migrate_vma_setup() can't migrate VM_SPECIAL VMAs, so we have to force
    // GPU remote mapping.
3704     // TODO: Bug 3660968: add support for file-backed migrations.
3705     // TODO: Bug 3368756: add support for transparent huge page migrations.
3706     return !vma_is_anonymous(vma) ||
3707            (vma->vm_flags & VM_SPECIAL) ||
3708            vma_is_dax(vma) ||
3709            is_vm_hugetlb_page(vma);
3710 }
3711 
3712 #endif // UVM_IS_CONFIG_HMM()
3713