/*******************************************************************************
    Copyright (c) 2018-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_migrate_pageable.h"
#include "uvm_populate_pageable.h"

#ifdef UVM_MIGRATE_VMA_SUPPORTED

static struct kmem_cache *g_uvm_migrate_vma_state_cache __read_mostly;

static const gfp_t g_migrate_vma_gfp_flags = NV_UVM_GFP_FLAGS | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE;

// Compute the address needed for copying_gpu to access the given page,
// resident on resident_id.
static NV_STATUS migrate_vma_page_copy_address(struct page *page,
                                               unsigned long page_index,
                                               uvm_processor_id_t resident_id,
                                               uvm_gpu_t *copying_gpu,
                                               migrate_vma_state_t *state,
                                               uvm_gpu_address_t *gpu_addr)
{
    uvm_va_space_t *va_space = state->uvm_migrate_args->va_space;
    uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id) ? NULL : uvm_va_space_get_gpu(va_space, resident_id);
    const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
                                                       resident_id);
    const bool direct_peer = owning_gpu &&
                             (owning_gpu != copying_gpu) &&
                             can_copy_from &&
                             !uvm_gpu_peer_caps(owning_gpu, copying_gpu)->is_indirect_peer;

    UVM_ASSERT(page_index < state->num_pages);

    memset(gpu_addr, 0, sizeof(*gpu_addr));

    if (owning_gpu == copying_gpu) {
        // Local vidmem address
        *gpu_addr = uvm_gpu_address_from_phys(uvm_gpu_page_to_phys_address(owning_gpu, page));
    }
    else if (direct_peer) {
        // Direct GPU peer
        uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
        uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);

        *gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
    }
    else {
        // Sysmem/Indirect Peer
        NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);

        if (status != NV_OK)
            return status;

        state->dma.addrs_gpus[page_index] = copying_gpu;

        if (state->dma.num_pages++ == 0)
            bitmap_zero(state->dma.page_mask, state->num_pages);

        UVM_ASSERT(!test_bit(page_index, state->dma.page_mask));

        __set_bit(page_index, state->dma.page_mask);

        *gpu_addr = uvm_gpu_address_physical(UVM_APERTURE_SYS, state->dma.addrs[page_index]);
    }

    return NV_OK;
}

// Return the GPU identified with the given NUMA node id
static uvm_gpu_t *get_gpu_from_node_id(uvm_va_space_t *va_space, int node_id)
{
    uvm_gpu_t *gpu;

    for_each_va_space_gpu(gpu, va_space) {
        if (uvm_gpu_numa_info(gpu)->node_id == node_id)
            return gpu;
    }

    return NULL;
}

// Create a new push to zero pages on dst_id
static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_gpu_t *gpu,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;

    if (UVM_ID_IS_CPU(dst_id)) {
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        UVM_ASSERT(uvm_id_equal(dst_id, gpu->id));
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Zero %s from %s VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, dst_id),
                          uvm_va_space_processor_name(va_space, gpu->id),
                          start,
                          outer);
}

// Create a new push to copy pages between src_id and dst_id
static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_processor_id_t src_id,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;
    uvm_gpu_t *gpu;

    UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
                   "Unexpected copy to self, processor %s\n",
                   uvm_va_space_processor_name(va_space, src_id));

    if (UVM_ID_IS_CPU(src_id)) {
        gpu = uvm_va_space_get_gpu(va_space, dst_id);
        channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
    }
    else if (UVM_ID_IS_CPU(dst_id)) {
        gpu = uvm_va_space_get_gpu(va_space, src_id);
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        // For GPU to GPU copies, prefer to "push" the data from the source as
        // that works better
        gpu = uvm_va_space_get_gpu(va_space, src_id);

        channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
    }

    // NUMA-enabled GPUs can copy to any other NUMA node in the system even if
    // P2P access has not been explicitly enabled (ie va_space->can_copy_from
    // is not set).
    if (!gpu->parent->numa_info.enabled) {
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], src_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
    }

    if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
        uvm_gpu_t *dst_gpu = uvm_va_space_get_gpu(va_space, dst_id);
        return uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                         dst_gpu,
                                         push,
                                         "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                                         uvm_va_space_processor_name(va_space, src_id),
                                         uvm_va_space_processor_name(va_space, dst_id),
                                         start,
                                         outer);
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, src_id),
                          uvm_va_space_processor_name(va_space, dst_id),
                          start,
                          outer);
}

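// Scan the src PFN array and classify each page: pages that the caller must
// populate with get_user_pages, pages already resident at the destination,
// and pages to be copied, which are grouped per source processor in
// state->processors[] and state->src_processors.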
static void migrate_vma_compute_masks(struct vm_area_struct *vma, const unsigned long *src, migrate_vma_state_t *state)
{
    unsigned long i;
    const bool is_rw = vma->vm_flags & VM_WRITE;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;

    UVM_ASSERT(vma_is_anonymous(vma));

    bitmap_zero(state->populate_pages_mask, state->num_pages);
    bitmap_zero(state->allocation_failed_mask, state->num_pages);
    bitmap_zero(state->dst_resident_pages_mask, state->num_pages);

    uvm_processor_mask_zero(&state->src_processors);
    state->num_populate_anon_pages = 0;
    state->dma.num_pages = 0;

    for (i = 0; i < state->num_pages; ++i) {
        uvm_processor_id_t src_id;
        struct page *src_page = NULL;
        int src_nid;
        uvm_gpu_t *src_gpu = NULL;

        // Skip pages that cannot be migrated
        if (!(src[i] & MIGRATE_PFN_MIGRATE)) {
            // This can happen in two cases:
            // - The page is populated but cannot be migrated.
            // - The page is not populated.
            // In both cases, treat the page as having failed migration and
            // populate it with get_user_pages.
            if (!(src[i] & MIGRATE_PFN_VALID))
                __set_bit(i, state->populate_pages_mask);

            continue;
        }

        src_page = migrate_pfn_to_page(src[i]);
        if (!src_page) {
            if (is_rw) {
                // Populate PROT_WRITE vmas in migrate_vma so we can use the
                // GPU's copy engines
                if (state->num_populate_anon_pages++ == 0)
                    bitmap_zero(state->processors[uvm_id_value(dst_id)].page_mask, state->num_pages);

                __set_bit(i, state->processors[uvm_id_value(dst_id)].page_mask);
            }
            else {
                // PROT_NONE vmas cannot be populated. PROT_READ anonymous vmas
                // are populated using the zero page. In order to match this
                // behavior, we tell the caller to populate using
                // get_user_pages.
                __set_bit(i, state->populate_pages_mask);
            }

            continue;
        }

        // Page is already mapped. Skip migration of this page if requested.
        if (uvm_migrate_args->skip_mapped) {
            __set_bit(i, state->populate_pages_mask);
            continue;
        }

        src_nid = page_to_nid(src_page);

        // Already at destination
        if (src_nid == uvm_migrate_args->dst_node_id) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // Already resident on a CPU node, don't move
        if (UVM_ID_IS_CPU(dst_id) && node_state(src_nid, N_CPU)) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        src_gpu = get_gpu_from_node_id(uvm_migrate_args->va_space, src_nid);

        // Already resident on a node with no CPUs that doesn't belong to a
        // GPU, don't move
        if (UVM_ID_IS_CPU(dst_id) && !src_gpu) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // TODO: Bug 2449272: Implement non-P2P copies. All systems that hit
        // this path have P2P copy support between all GPUs in the system, but
        // it could change in the future.

        if (src_gpu)
            src_id = src_gpu->id;
        else
            src_id = UVM_ID_CPU;

        if (!uvm_processor_mask_test_and_set(&state->src_processors, src_id))
            bitmap_zero(state->processors[uvm_id_value(src_id)].page_mask, state->num_pages);

        __set_bit(i, state->processors[uvm_id_value(src_id)].page_mask);
    }
}

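// Allocate a destination page on uvm_migrate_args->dst_node_id. Returns NULL
// on allocation failure, on injected failures from the test interface, or if
// __GFP_THISNODE was not honored by the kernel allocator.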
static struct page *migrate_vma_alloc_page(migrate_vma_state_t *state)
{
    struct page *dst_page;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.migrate_vma_allocation_fail_nth) == 0) {
        dst_page = NULL;
    }
    else {
        dst_page = alloc_pages_node(uvm_migrate_args->dst_node_id, g_migrate_vma_gfp_flags, 0);

        // TODO: Bug 2399573: Linux commit
        // 183f6371aac2a5496a8ef2b0b0a68562652c3cdb introduced a bug that makes
        // __GFP_THISNODE not always be honored (this was later fixed in commit
        // 7810e6781e0fcbca78b91cf65053f895bf59e85f). Therefore, we verify
        // whether the flag has been honored and abort the allocation
        // otherwise. Remove this check when the fix is deployed on all
        // production systems.
        if (dst_page && page_to_nid(dst_page) != uvm_migrate_args->dst_node_id) {
            __free_page(dst_page);
            dst_page = NULL;
        }
    }

    return dst_page;
}

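// Allocate and zero destination pages for the anonymous pages that had not
// been populated yet (only done for PROT_WRITE vmas), using the copy engine
// of a GPU associated with the destination.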
static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
                                                 unsigned long *dst,
                                                 unsigned long start,
                                                 unsigned long outer,
                                                 migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(dst_id)].page_mask;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_push_t push;
    unsigned long i;

    // Nothing to do
    if (state->num_populate_anon_pages == 0)
        return NV_OK;

    UVM_ASSERT(state->num_populate_anon_pages == bitmap_weight(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t dst_address;
        struct page *dst_page;

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            // Try to get a GPU attached to the node being populated. If there
            // is none, use any of the GPUs registered in the VA space.
            if (UVM_ID_IS_CPU(dst_id)) {
                copying_gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, uvm_migrate_args->dst_node_id);
                if (!copying_gpu)
                    copying_gpu = uvm_va_space_find_first_gpu(va_space);
            }
            else {
                copying_gpu = uvm_va_space_get_gpu(va_space, dst_id);
            }

            UVM_ASSERT(copying_gpu);

            status = migrate_vma_zero_begin_push(va_space, dst_id, copying_gpu, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);
        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all memsets in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memset_8(&push, dst_address, 0, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

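// Copy all pages currently resident on src_id to newly allocated pages on the
// destination, batching all copies from this source into a single push.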
static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             unsigned long *dst,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_processor_id_t src_id,
                                             migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_push_t push;
    unsigned long i;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(src_id)].page_mask;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(!bitmap_empty(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            status = migrate_vma_copy_begin_push(va_space, dst_id, src_id, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }

            copying_gpu = uvm_push_get_gpu(&push);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        // We don't have a case where both src and dst use the SYS aperture, so
        // the second call can't overwrite a dma addr set up by the first call.
        status = migrate_vma_page_copy_address(src_page, i, src_id, copying_gpu, state, &src_address);
        if (status == NV_OK)
            status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);

        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all copies in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // TODO: Bug 1766424: If the destination is a GPU and the copy was done by
    //       that GPU, use a GPU-local membar if no peer nor the CPU can
    //       currently map this page. When peer access gets enabled, do a
    //       MEMBAR_SYS at that point.
    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

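// Copy pages from every source processor recorded by
// migrate_vma_compute_masks()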
static NV_STATUS migrate_vma_copy_pages(struct vm_area_struct *vma,
                                        const unsigned long *src,
                                        unsigned long *dst,
                                        unsigned long start,
                                        unsigned long outer,
                                        migrate_vma_state_t *state)
{
    uvm_processor_id_t src_id;

    for_each_id_in_mask(src_id, &state->src_processors) {
        NV_STATUS status = migrate_vma_copy_pages_from(vma, src, dst, start, outer, src_id, state);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

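// alloc_and_copy stage of the migration: compute the page masks, then
// allocate, zero and copy the destination pages. The result is stored in
// state->status, and the tracker is waited on so all copies have completed
// before returning.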
void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
{
    struct vm_area_struct *vma = args->vma;
    unsigned long start = args->start;
    unsigned long outer = args->end;
    NV_STATUS tracker_status;

    uvm_tracker_init(&state->tracker);

    state->num_pages = (outer - start) / PAGE_SIZE;
    state->status = NV_OK;

    migrate_vma_compute_masks(vma, args->src, state);

    state->status = migrate_vma_populate_anon_pages(vma, args->dst, start, outer, state);

    if (state->status == NV_OK)
        state->status = migrate_vma_copy_pages(vma, args->src, args->dst, start, outer, state);

    // Wait for tracker since all copies must have completed before returning
    tracker_status = uvm_tracker_wait_deinit(&state->tracker);

    if (state->status == NV_OK)
        state->status = tracker_status;
}

void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
                                           const unsigned long *src,
                                           unsigned long *dst,
                                           unsigned long start,
                                           unsigned long end,
                                           void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
}

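// finalize_and_map stage of the migration: decide which pages still need to
// be populated with get_user_pages, touch pages when requested, and remove
// the IOMMU mappings created during the copy.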
void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
{
    unsigned long i;

    for (i = 0; i < state->num_pages; i++) {
        bool needs_touch = false;
        uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

        // The page was successfully migrated.
        if (args->src[i] & MIGRATE_PFN_MIGRATE) {
            // Touch if requested since population of these pages won't be tried
            // later.
            needs_touch = true;
        }
        else {
            // The page was not migrated. This can happen for two reasons.
            //
            // 1. Page is already resident at the destination.
            // 2. Page failed migration because the page state could not be
            // migrated by the kernel.
            //
            // So, only set the corresponding populate_pages bit if both of
            // the following conditions are true.
            //
            // 1. The page is not already resident at the destination. Trying
            // to populate such pages (with gup) is wasteful but usually
            // harmless, except in the PROT_NONE case: gup returns
            // NV_ERR_INVALID_ADDRESS for those pages, which would incorrectly
            // surface as API migration failures even though the migration
            // worked as expected.
            //
            // 2. The migration failure was not caused by an allocation
            // failure in uvm_migrate_vma_alloc_and_copy(), since such
            // failures are already indicated in allocation_failed_mask.
            // Failures other than allocation failures likely mean that the
            // page is populated somewhere, so set the corresponding bit in
            // populate_pages_mask.
            if (test_bit(i, state->dst_resident_pages_mask)) {

                // If touch was requested, pages in allocation_failed and
                // populate_pages masks will be touched during population. But
                // pages which are already resident at the destination need to
                // be touched here since population isn't tried later for such
                // pages.
                needs_touch = true;
            }
            else if (!test_bit(i, state->allocation_failed_mask)) {
                __set_bit(i, state->populate_pages_mask);
            }
        }

        // Touch if requested and needed.
        if (uvm_migrate_args->touch && needs_touch) {
            struct page *dst_page;

            UVM_ASSERT(args->dst[i] & MIGRATE_PFN_VALID);

            dst_page = migrate_pfn_to_page(args->dst[i]);
            UVM_ASSERT(dst_page);
            uvm_touch_page(dst_page);
        }
    }

    // Remove the IOMMU mappings created during the copy
    if (state->dma.num_pages > 0) {

        for_each_set_bit(i, state->dma.page_mask, state->num_pages)
            uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i]->parent, state->dma.addrs[i]);
    }

    UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));
}

void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             const unsigned long *dst,
                                             unsigned long start,
                                             unsigned long end,
                                             void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = (unsigned long *) dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_finalize_and_map(&args, (migrate_vma_state_t *) private);
}

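// Invoke the kernel's migrate_vma API, using either the legacy
// migrate_vma_ops callbacks or the newer migrate_vma_setup()/
// migrate_vma_pages()/migrate_vma_finalize() sequence, depending on what the
// running kernel provides.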
static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *state)
{
    int ret;

#if defined(CONFIG_MIGRATE_VMA_HELPER)
    static const struct migrate_vma_ops uvm_migrate_vma_ops =
    {
        .alloc_and_copy = uvm_migrate_vma_alloc_and_copy_helper,
        .finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
    };

    ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
    if (ret < 0)
        return errno_to_nv_status(ret);
#else // CONFIG_MIGRATE_VMA_HELPER

#if defined(NV_MIGRATE_VMA_FLAGS_PRESENT)
    args->flags = MIGRATE_VMA_SELECT_SYSTEM;
#endif // NV_MIGRATE_VMA_FLAGS_PRESENT

    ret = migrate_vma_setup(args);
    if (ret < 0)
        return errno_to_nv_status(ret);

    uvm_migrate_vma_alloc_and_copy(args, state);
    if (state->status == NV_OK) {
        migrate_vma_pages(args);
        uvm_migrate_vma_finalize_and_map(args, state);
    }

    migrate_vma_finalize(args);
#endif // CONFIG_MIGRATE_VMA_HELPER

    return state->status;
}

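// Populate the pages set in the given mask with get_user_pages, one
// contiguous subregion at a time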
static NV_STATUS migrate_pageable_vma_populate_mask(struct vm_area_struct *vma,
                                                    unsigned long start,
                                                    unsigned long outer,
                                                    const unsigned long *mask,
                                                    migrate_vma_state_t *state)
{
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

    while (subregion_first < num_pages) {
        NV_STATUS status;
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        status = uvm_populate_pageable_vma(vma,
                                           start + subregion_first * PAGE_SIZE,
                                           (subregion_outer - subregion_first) * PAGE_SIZE,
                                           0,
                                           uvm_migrate_args->touch,
                                           uvm_migrate_args->populate_permissions);
        if (status != NV_OK)
            return status;

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

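// Retry migration with migrate_vma for the pages set in the given mask, one
// contiguous subregion at a time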
static NV_STATUS migrate_pageable_vma_migrate_mask(struct vm_area_struct *vma,
                                                   unsigned long start,
                                                   unsigned long outer,
                                                   const unsigned long *mask,
                                                   migrate_vma_state_t *state)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
    };

    UVM_ASSERT(!uvm_migrate_args->skip_mapped);

    while (subregion_first < num_pages) {
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        args.start = start + subregion_first * PAGE_SIZE;
        args.end = start + subregion_outer * PAGE_SIZE;

        status = nv_migrate_vma(&args, state);
        if (status != NV_OK)
            return status;

        // We ignore allocation failure here since we are just retrying
        // migration and the pages must have already been populated by the
        // caller

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

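// Migrate a range of up to UVM_MIGRATE_VMA_MAX_SIZE bytes within a single
// vma. Pages that migrate_vma could not move are populated with
// get_user_pages instead; if allocations on the destination fail and the
// destination is the CPU, NV_ERR_MORE_PROCESSING_REQUIRED is returned with
// *next_addr set so user space can retry on a different node.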
static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
                                             unsigned long start,
                                             unsigned long outer,
                                             migrate_vma_state_t *state,
                                             unsigned long *next_addr)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
        .start = start,
        .end = outer,
    };

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(start < outer);
    UVM_ASSERT(start >= vma->vm_start);
    UVM_ASSERT(outer <= vma->vm_end);
    UVM_ASSERT(outer - start <= UVM_MIGRATE_VMA_MAX_SIZE);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&uvm_migrate_args->va_space->lock);

    status = nv_migrate_vma(&args, state);
    if (status != NV_OK)
        return status;

    // Save the returned page masks because they can be overwritten by
    // migrate_pageable_vma_migrate_mask().
    bitmap_copy(state->scratch1_mask, state->populate_pages_mask, num_pages);
    bitmap_copy(state->scratch2_mask, state->allocation_failed_mask, num_pages);

    if (!bitmap_empty(state->scratch1_mask, state->num_pages)) {
        // Populate pages using get_user_pages
        status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch1_mask, state);
        if (status != NV_OK)
            return status;

        if (!uvm_migrate_args->skip_mapped) {
            status = migrate_pageable_vma_migrate_mask(vma, start, outer, state->scratch1_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    // There is no need to copy the masks again after the migration is retried.
    // We ignore the allocation_failed, populate_pages and dst_resident_pages
    // masks set by the retried migration.

    if (!bitmap_empty(state->scratch2_mask, state->num_pages)) {
        // If the destination is the CPU, signal user-space to retry with a
        // different node. Otherwise, just try to populate anywhere in the
        // system
        if (UVM_ID_IS_CPU(uvm_migrate_args->dst_id)) {
            *next_addr = start + find_first_bit(state->scratch2_mask, num_pages) * PAGE_SIZE;
            return NV_ERR_MORE_PROCESSING_REQUIRED;
        }
        else {
            status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch2_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    return NV_OK;
}

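// Migrate the portion of the given vma that overlaps [start, outer), breaking
// it into chunks of at most UVM_MIGRATE_VMA_MAX_SIZE bytes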
static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
                                      unsigned long start,
                                      unsigned long outer,
                                      migrate_vma_state_t *state,
                                      unsigned long *next_addr)
{
    NV_STATUS status = NV_OK;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(vma->vm_end > start);
    UVM_ASSERT(vma->vm_start < outer);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // Adjust to input range boundaries
    start = max(start, vma->vm_start);
    outer = min(outer, vma->vm_end);

    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
    //       support for it is added to the Linux kernel
    if (!vma_is_anonymous(vma))
        return NV_WARN_NOTHING_TO_DO;

    if (uvm_processor_mask_empty(&va_space->registered_gpus))
        return NV_WARN_NOTHING_TO_DO;

    while (start < outer) {
        const size_t region_size = min(outer - start, UVM_MIGRATE_VMA_MAX_SIZE);

        status = migrate_pageable_vma_region(vma, start, start + region_size, state, next_addr);
        if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(*next_addr >= start);
            UVM_ASSERT(*next_addr < outer);
        }

        if (status != NV_OK)
            break;

        start += region_size;
    }

    return status;
}

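// Walk all vmas overlapping the requested range and migrate them one at a
// time, since migrate_vma operates on a single vma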
static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
{
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    const unsigned long length = uvm_migrate_args->length;
    NvU64 *user_space_start = uvm_migrate_args->user_space_start;
    NvU64 *user_space_length = uvm_migrate_args->user_space_length;
    struct mm_struct *mm = uvm_migrate_args->mm;
    unsigned long start = uvm_migrate_args->start;
    unsigned long outer = start + length;
    unsigned long prev_outer = outer;
    struct vm_area_struct *vma;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(length));
    uvm_assert_mmap_lock_locked(mm);

    vma = find_vma_intersection(mm, start, outer);
    if (!vma || (start < vma->vm_start))
        return NV_ERR_INVALID_ADDRESS;

    // VMAs are validated and migrated one at a time, since migrate_vma works
    // on one vma at a time
    for (; vma->vm_start <= prev_outer; vma = find_vma_intersection(mm, prev_outer, outer)) {
        unsigned long next_addr = 0;
        NV_STATUS status;

        // Callers have already validated the range so the vma should be valid.
        UVM_ASSERT(vma);

        status = migrate_pageable_vma(vma, start, outer, state, &next_addr);
        if (status == NV_WARN_NOTHING_TO_DO) {
            NV_STATUS populate_status = NV_OK;
            bool touch = uvm_migrate_args->touch;
            uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;

            UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));

            // We can't use migrate_vma to move the pages as desired. Normally
            // this fallback path is supposed to populate the memory then inform
            // user mode that it should call move_pages, but that move_pages
            // call won't work as expected if the caller is in the wrong
            // process. Make that failure explicit so the caller is aware that
            // move_pages won't behave as expected.
            //
            // If the caller is a kernel thread, such as the GPU BH, continue
            // with population since there's no move_pages fallback.
            if (current->mm != mm && !(current->flags & PF_KTHREAD))
                return NV_ERR_NOT_SUPPORTED;

            // Populate pages with uvm_populate_pageable
            populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
            if (populate_status == NV_OK) {
                *user_space_start = max(vma->vm_start, start);
                *user_space_length = min(vma->vm_end, outer) - *user_space_start;
            }
            else {
                status = populate_status;
            }
        }
        else if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(next_addr >= start);
            UVM_ASSERT(next_addr < outer);
            UVM_ASSERT(UVM_ID_IS_CPU(uvm_migrate_args->dst_id));

            *user_space_start = next_addr;
        }

        if (status != NV_OK)
            return status;

        if (vma->vm_end >= outer)
            return NV_OK;

        prev_outer = vma->vm_end;
    }

    // Input range not fully covered by VMAs.
    return NV_ERR_INVALID_ADDRESS;
}

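// Entry point to migrate a range of pageable memory to the processor and NUMA
// node described by uvm_migrate_args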
NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
{
    migrate_vma_state_t *state = NULL;
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    int dst_node_id = uvm_migrate_args->dst_node_id;

    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->start));
    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->length));
    uvm_assert_mmap_lock_locked(uvm_migrate_args->mm);

    if (UVM_ID_IS_CPU(dst_id)) {
        // We only check that dst_node_id is a valid node in the system and it
        // doesn't correspond to a GPU node. This is fine because
        // alloc_pages_node will clamp the allocation to
        // cpuset_current_mems_allowed, and uvm_migrate_pageable is only called
        // from process context (uvm_migrate) when dst_id is CPU. UVM bottom
        // half never calls uvm_migrate_pageable when dst_id is CPU. So, assert
        // that we're in a user thread. However, this would need to change if we
        // wanted to call this function from a bottom half with CPU dst_id.
        UVM_ASSERT(!(current->flags & PF_KTHREAD));

        if (!nv_numa_node_has_memory(dst_node_id) || get_gpu_from_node_id(va_space, dst_node_id) != NULL)
            return NV_ERR_INVALID_ARGUMENT;
    }
    else {
        // Incoming dst_node_id is only valid if dst_id belongs to the CPU. Use
        // dst_node_id as the GPU node id if dst_id doesn't belong to the CPU.
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_info(uvm_va_space_get_gpu(va_space, dst_id))->node_id;
    }

    state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
    if (!state)
        return NV_ERR_NO_MEMORY;

    state->uvm_migrate_args = uvm_migrate_args;
    status = migrate_pageable(state);

    kmem_cache_free(g_uvm_migrate_vma_state_cache, state);

    return status;
}

NV_STATUS uvm_migrate_pageable_init(void)
{
    g_uvm_migrate_vma_state_cache = NV_KMEM_CACHE_CREATE("migrate_vma_state_t", migrate_vma_state_t);
    if (!g_uvm_migrate_vma_state_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_migrate_pageable_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_migrate_vma_state_cache);
}
#endif // UVM_MIGRATE_VMA_SUPPORTED