/*******************************************************************************
    Copyright (c) 2018-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_common.h"
#include "uvm_linux.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_tracker.h"
#include "uvm_api.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_migrate_pageable.h"
#include "uvm_populate_pageable.h"

#ifdef UVM_MIGRATE_VMA_SUPPORTED

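// Cache used to allocate the per-operation migrate_vma_state_t tracking
// structure.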
static struct kmem_cache *g_uvm_migrate_vma_state_cache __read_mostly;

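// Allocation flags for destination pages: movable user memory that must come
// from the requested NUMA node (__GFP_THISNODE).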
static const gfp_t g_migrate_vma_gfp_flags = NV_UVM_GFP_FLAGS | GFP_HIGHUSER_MOVABLE | __GFP_THISNODE;

// Compute the address needed for copying_gpu to access the given page,
// resident on resident_id.
static NV_STATUS migrate_vma_page_copy_address(struct page *page,
                                               unsigned long page_index,
                                               uvm_processor_id_t resident_id,
                                               uvm_gpu_t *copying_gpu,
                                               migrate_vma_state_t *state,
                                               uvm_gpu_address_t *gpu_addr)
{
    uvm_va_space_t *va_space = state->uvm_migrate_args->va_space;
    uvm_gpu_t *owning_gpu = UVM_ID_IS_CPU(resident_id) ? NULL : uvm_va_space_get_gpu(va_space, resident_id);
    const bool can_copy_from = uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(copying_gpu->id)],
                                                       resident_id);
    const bool direct_peer = owning_gpu &&
                             (owning_gpu != copying_gpu) &&
                             can_copy_from &&
                             !uvm_gpu_peer_caps(owning_gpu, copying_gpu)->is_indirect_peer;

    UVM_ASSERT(page_index < state->num_pages);

    memset(gpu_addr, 0, sizeof(*gpu_addr));

    if (owning_gpu == copying_gpu) {
        // Local vidmem address
        *gpu_addr = uvm_gpu_address_copy(owning_gpu, uvm_gpu_page_to_phys_address(owning_gpu, page));
    }
    else if (direct_peer) {
        // Direct GPU peer
        uvm_gpu_identity_mapping_t *gpu_peer_mappings = uvm_gpu_get_peer_mapping(copying_gpu, owning_gpu->id);
        uvm_gpu_phys_address_t phys_addr = uvm_gpu_page_to_phys_address(owning_gpu, page);

        *gpu_addr = uvm_gpu_address_virtual(gpu_peer_mappings->base + phys_addr.address);
    }
    else {
        // Sysmem/Indirect Peer
        NV_STATUS status = uvm_gpu_map_cpu_page(copying_gpu->parent, page, &state->dma.addrs[page_index]);

        if (status != NV_OK)
            return status;

        state->dma.addrs_gpus[page_index] = copying_gpu;

        if (state->dma.num_pages++ == 0)
            bitmap_zero(state->dma.page_mask, state->num_pages);

        UVM_ASSERT(!test_bit(page_index, state->dma.page_mask));

        __set_bit(page_index, state->dma.page_mask);

        *gpu_addr = uvm_gpu_address_copy(copying_gpu,
                                         uvm_gpu_phys_address(UVM_APERTURE_SYS, state->dma.addrs[page_index]));
    }

    return NV_OK;
}

// Create a new push to zero pages on dst_id
static NV_STATUS migrate_vma_zero_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_gpu_t *gpu,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;

    if (UVM_ID_IS_CPU(dst_id)) {
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        UVM_ASSERT(uvm_id_equal(dst_id, gpu->id));
        channel_type = UVM_CHANNEL_TYPE_GPU_INTERNAL;
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Zero %s from %s VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, dst_id),
                          uvm_va_space_processor_name(va_space, gpu->id),
                          start,
                          outer);
}

// Create a new push to copy pages between src_id and dst_id
static NV_STATUS migrate_vma_copy_begin_push(uvm_va_space_t *va_space,
                                             uvm_processor_id_t dst_id,
                                             uvm_processor_id_t src_id,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_push_t *push)
{
    uvm_channel_type_t channel_type;
    uvm_gpu_t *gpu;

    UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
                   "Unexpected copy to self, processor %s\n",
                   uvm_va_space_processor_name(va_space, src_id));

    if (UVM_ID_IS_CPU(src_id)) {
        gpu = uvm_va_space_get_gpu(va_space, dst_id);
        channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
    }
    else if (UVM_ID_IS_CPU(dst_id)) {
        gpu = uvm_va_space_get_gpu(va_space, src_id);
        channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
    }
    else {
        // For GPU to GPU copies, prefer to "push" the data from the source
        // GPU, since peer writes typically perform better than peer reads.
        gpu = uvm_va_space_get_gpu(va_space, src_id);

        channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
    }

    // NUMA-enabled GPUs can copy to any other NUMA node in the system even if
    // P2P access has not been explicitly enabled (i.e. va_space->can_copy_from
    // is not set).
    if (!gpu->mem_info.numa.enabled) {
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], dst_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
        UVM_ASSERT_MSG(uvm_processor_mask_test(&va_space->can_copy_from[uvm_id_value(gpu->id)], src_id),
                       "GPU %s dst %s src %s\n",
                       uvm_va_space_processor_name(va_space, gpu->id),
                       uvm_va_space_processor_name(va_space, dst_id),
                       uvm_va_space_processor_name(va_space, src_id));
    }

    if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
        uvm_gpu_t *dst_gpu = uvm_va_space_get_gpu(va_space, dst_id);
        return uvm_push_begin_gpu_to_gpu(gpu->channel_manager,
                                         dst_gpu,
                                         push,
                                         "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                                         uvm_va_space_processor_name(va_space, src_id),
                                         uvm_va_space_processor_name(va_space, dst_id),
                                         start,
                                         outer);
    }

    return uvm_push_begin(gpu->channel_manager,
                          channel_type,
                          push,
                          "Copy from %s to %s for VMA region [0x%lx, 0x%lx]",
                          uvm_va_space_processor_name(va_space, src_id),
                          uvm_va_space_processor_name(va_space, dst_id),
                          start,
                          outer);
}

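// Classify the pages reported by migrate_vma in src: record which processors
// they are currently resident on (src_processors and the per-processor page
// masks), which pages must be populated with get_user_pages instead
// (populate_pages_mask), and which pages are already resident at the
// destination (dst_resident_pages_mask).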
static void migrate_vma_compute_masks(struct vm_area_struct *vma, const unsigned long *src, migrate_vma_state_t *state)
{
    unsigned long i;
    const bool is_rw = vma->vm_flags & VM_WRITE;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;

    UVM_ASSERT(vma_is_anonymous(vma));

    bitmap_zero(state->populate_pages_mask, state->num_pages);
    bitmap_zero(state->allocation_failed_mask, state->num_pages);
    bitmap_zero(state->dst_resident_pages_mask, state->num_pages);

    uvm_processor_mask_zero(&state->src_processors);
    state->num_populate_anon_pages = 0;
    state->dma.num_pages = 0;

    for (i = 0; i < state->num_pages; ++i) {
        uvm_processor_id_t src_id;
        struct page *src_page = NULL;
        int src_nid;
        uvm_gpu_t *src_gpu = NULL;

        // Skip pages that cannot be migrated
        if (!(src[i] & MIGRATE_PFN_MIGRATE)) {
            // This can happen in two cases:
            // - Page is populated but can't be migrated.
            // - Page isn't populated
            // In both the above cases, treat the page as failing migration and
            // populate with get_user_pages.
            if (!(src[i] & MIGRATE_PFN_VALID))
                __set_bit(i, state->populate_pages_mask);

            continue;
        }

        src_page = migrate_pfn_to_page(src[i]);
        if (!src_page) {
            if (is_rw) {
                // Populate PROT_WRITE vmas in migrate_vma so we can use the
                // GPU's copy engines
                if (state->num_populate_anon_pages++ == 0)
                    bitmap_zero(state->processors[uvm_id_value(dst_id)].page_mask, state->num_pages);

                __set_bit(i, state->processors[uvm_id_value(dst_id)].page_mask);
            }
            else {
                // PROT_NONE vmas cannot be populated. PROT_READ anonymous vmas
                // are populated using the zero page. In order to match this
                // behavior, we tell the caller to populate using
                // get_user_pages.
                __set_bit(i, state->populate_pages_mask);
            }

            continue;
        }

        // Page is already mapped. Skip migration of this page if requested.
        if (uvm_migrate_args->skip_mapped) {
            __set_bit(i, state->populate_pages_mask);
            continue;
        }

        src_nid = page_to_nid(src_page);

        // Already at destination
        if (src_nid == uvm_migrate_args->dst_node_id) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // Already resident on a CPU node, don't move
        if (UVM_ID_IS_CPU(dst_id) && node_state(src_nid, N_CPU)) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        src_gpu = uvm_va_space_find_gpu_with_memory_node_id(uvm_migrate_args->va_space, src_nid);

        // Already resident on a node with no CPUs that doesn't belong to a
        // GPU, don't move
        if (UVM_ID_IS_CPU(dst_id) && !src_gpu) {
            __set_bit(i, state->dst_resident_pages_mask);
            continue;
        }

        // TODO: Bug 2449272: Implement non-P2P copies. All systems that hit
        // this path have P2P copy support between all GPUs in the system, but
        // it could change in the future.

        if (src_gpu)
            src_id = src_gpu->id;
        else
            src_id = UVM_ID_CPU;

        if (!uvm_processor_mask_test_and_set(&state->src_processors, src_id))
            bitmap_zero(state->processors[uvm_id_value(src_id)].page_mask, state->num_pages);

        __set_bit(i, state->processors[uvm_id_value(src_id)].page_mask);
    }
}

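// Allocate a destination page on dst_node_id. Returns NULL on allocation
// failure, including the allocation failures injected by built-in tests and
// allocations that land on the wrong node despite __GFP_THISNODE.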
static struct page *migrate_vma_alloc_page(migrate_vma_state_t *state)
{
    struct page *dst_page;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.migrate_vma_allocation_fail_nth) == 0) {
        dst_page = NULL;
    }
    else {
        dst_page = alloc_pages_node(uvm_migrate_args->dst_node_id, g_migrate_vma_gfp_flags, 0);

        // TODO: Bug 2399573: Linux commit
        // 183f6371aac2a5496a8ef2b0b0a68562652c3cdb introduced a bug that makes
        // __GFP_THISNODE not always be honored (this was later fixed in commit
        // 7810e6781e0fcbca78b91cf65053f895bf59e85f). Therefore, we verify
        // whether the flag has been honored and abort the allocation
        // otherwise. Remove this check when the fix is deployed on all
        // production systems.
        if (dst_page && page_to_nid(dst_page) != uvm_migrate_args->dst_node_id) {
            __free_page(dst_page);
            dst_page = NULL;
        }
    }

    return dst_page;
}

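// Allocate and zero destination pages for the unpopulated anonymous pages
// selected by migrate_vma_compute_masks, using the copy engines of a GPU that
// can write to the destination.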
static NV_STATUS migrate_vma_populate_anon_pages(struct vm_area_struct *vma,
                                                 unsigned long *dst,
                                                 unsigned long start,
                                                 unsigned long outer,
                                                 migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(dst_id)].page_mask;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_push_t push;
    unsigned long i;

    // Nothing to do
    if (state->num_populate_anon_pages == 0)
        return NV_OK;

    UVM_ASSERT(state->num_populate_anon_pages == bitmap_weight(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t dst_address;
        struct page *dst_page;

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            // Try to get a GPU attached to the node being populated. If there
            // is none, use any of the GPUs registered in the VA space.
            if (UVM_ID_IS_CPU(dst_id)) {
                copying_gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, uvm_migrate_args->dst_node_id);
                if (!copying_gpu)
                    copying_gpu = uvm_va_space_find_first_gpu(va_space);
            }
            else {
                copying_gpu = uvm_va_space_get_gpu(va_space, dst_id);
            }

            UVM_ASSERT(copying_gpu);

            status = migrate_vma_zero_begin_push(va_space, dst_id, copying_gpu, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);
        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all memsets in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memset_8(&push, dst_address, 0, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

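// Copy all pages currently resident on src_id to newly allocated pages on the
// destination, batching the copies into a single push when possible.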
static NV_STATUS migrate_vma_copy_pages_from(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             unsigned long *dst,
                                             unsigned long start,
                                             unsigned long outer,
                                             uvm_processor_id_t src_id,
                                             migrate_vma_state_t *state)
{
    NV_STATUS status = NV_OK;
    uvm_push_t push;
    unsigned long i;
    uvm_gpu_t *copying_gpu = NULL;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    unsigned long *page_mask = state->processors[uvm_id_value(src_id)].page_mask;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(!bitmap_empty(page_mask, state->num_pages));

    for_each_set_bit(i, page_mask, state->num_pages) {
        uvm_gpu_address_t src_address;
        uvm_gpu_address_t dst_address;
        struct page *src_page = migrate_pfn_to_page(src[i]);
        struct page *dst_page;

        UVM_ASSERT(src[i] & MIGRATE_PFN_VALID);
        UVM_ASSERT(src_page);

        dst_page = migrate_vma_alloc_page(state);
        if (!dst_page) {
            __set_bit(i, state->allocation_failed_mask);
            continue;
        }

        if (!copying_gpu) {
            status = migrate_vma_copy_begin_push(va_space, dst_id, src_id, start, outer - 1, &push);
            if (status != NV_OK) {
                __free_page(dst_page);
                return status;
            }

            copying_gpu = uvm_push_get_gpu(&push);
        }
        else {
            uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
        }

        // We don't have a case where both src and dst use the SYS aperture, so
        // the second call can't overwrite a dma addr set up by the first call.
        status = migrate_vma_page_copy_address(src_page, i, src_id, copying_gpu, state, &src_address);
        if (status == NV_OK)
            status = migrate_vma_page_copy_address(dst_page, i, dst_id, copying_gpu, state, &dst_address);

        if (status != NV_OK) {
            __free_page(dst_page);
            break;
        }

        lock_page(dst_page);

        // We'll push one membar later for all copies in this loop
        uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
        copying_gpu->parent->ce_hal->memcopy(&push, dst_address, src_address, PAGE_SIZE);

        dst[i] = migrate_pfn(page_to_pfn(dst_page));
    }

    // TODO: Bug 1766424: If the destination is a GPU and the copy was done by
    //       that GPU, use a GPU-local membar if no peer nor the CPU can
    //       currently map this page. When peer access gets enabled, do a
    //       MEMBAR_SYS at that point.
    if (copying_gpu) {
        NV_STATUS tracker_status;

        uvm_push_end(&push);

        tracker_status = uvm_tracker_add_push_safe(&state->tracker, &push);
        if (status == NV_OK)
            status = tracker_status;
    }

    return status;
}

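// Copy pages to the destination from every processor that has resident pages
// in the region.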
static NV_STATUS migrate_vma_copy_pages(struct vm_area_struct *vma,
                                        const unsigned long *src,
                                        unsigned long *dst,
                                        unsigned long start,
                                        unsigned long outer,
                                        migrate_vma_state_t *state)
{
    uvm_processor_id_t src_id;

    for_each_id_in_mask(src_id, &state->src_processors) {
        NV_STATUS status = migrate_vma_copy_pages_from(vma, src, dst, start, outer, src_id, state);
        if (status != NV_OK)
            return status;
    }

    return NV_OK;
}

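// Unlock and free the destination pages allocated so far and clear their dst
// entries so migrate_vma treats every page as not migrating.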
static void migrate_vma_cleanup_pages(unsigned long *dst, unsigned long npages)
{
    unsigned long i;

    for (i = 0; i < npages; i++) {
        struct page *dst_page = migrate_pfn_to_page(dst[i]);

        if (!dst_page)
            continue;

        unlock_page(dst_page);
        __free_page(dst_page);
        dst[i] = 0;
    }
}

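// alloc_and_copy stage of the migration: compute the page masks, then allocate
// destination pages and fill them with zeros or with copies of the source
// pages. The result is stored in state->status; on failure all destination
// pages are released so the kernel does not migrate anything.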
void uvm_migrate_vma_alloc_and_copy(struct migrate_vma *args, migrate_vma_state_t *state)
{
    struct vm_area_struct *vma = args->vma;
    unsigned long start = args->start;
    unsigned long outer = args->end;
    NV_STATUS tracker_status;

    uvm_tracker_init(&state->tracker);

    state->num_pages = (outer - start) / PAGE_SIZE;
    state->status = NV_OK;

    migrate_vma_compute_masks(vma, args->src, state);

    state->status = migrate_vma_populate_anon_pages(vma, args->dst, start, outer, state);

    if (state->status == NV_OK)
        state->status = migrate_vma_copy_pages(vma, args->src, args->dst, start, outer, state);

    // Wait for tracker since all copies must have completed before returning
    tracker_status = uvm_tracker_wait_deinit(&state->tracker);

    if (state->status == NV_OK)
        state->status = tracker_status;

    // Mark all pages as not migrating if we're failing
    if (state->status != NV_OK)
        migrate_vma_cleanup_pages(args->dst, state->num_pages);
}

void uvm_migrate_vma_alloc_and_copy_helper(struct vm_area_struct *vma,
                                           const unsigned long *src,
                                           unsigned long *dst,
                                           unsigned long start,
                                           unsigned long end,
                                           void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_alloc_and_copy(&args, (migrate_vma_state_t *) private);
}

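// finalize_and_map stage of the migration: determine which pages still need to
// be populated with get_user_pages, touch the migrated pages if requested, and
// remove the IOMMU mappings created for the copies.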
void uvm_migrate_vma_finalize_and_map(struct migrate_vma *args, migrate_vma_state_t *state)
{
    unsigned long i;

    for (i = 0; i < state->num_pages; i++) {
        bool needs_touch = false;
        uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

        // The page was successfully migrated.
        if (args->src[i] & MIGRATE_PFN_MIGRATE) {
            // Touch if requested since population of these pages won't be
            // tried later.
            needs_touch = true;
        }
        else {
            // The page was not migrated. This can happen for two reasons.
            //
            // 1. Page is already resident at the destination.
            // 2. Page failed migration because the page state could not be
            //    migrated by the kernel.
            //
            // So, only set the corresponding populate_pages bit if both of the
            // following conditions are true:
            //
            // 1. The page is not already resident at the destination. Trying
            //    to populate such pages (with gup) is wasteful but usually
            //    harmless, except in the PROT_NONE case: gup returns
            //    NV_ERR_INVALID_ADDRESS for those pages, which would
            //    incorrectly turn a migration that worked as expected into an
            //    API failure.
            //
            // 2. The migration failure was not caused by an allocation failure
            //    in uvm_migrate_vma_alloc_and_copy(), since such failures are
            //    already recorded in allocation_failed_mask. Failures other
            //    than allocation failures likely mean that the page is
            //    populated somewhere, so set the corresponding bit in
            //    populate_pages_mask.
            if (test_bit(i, state->dst_resident_pages_mask)) {

                // If touch was requested, pages in allocation_failed and
                // populate_pages masks will be touched during population. But
                // pages which are already resident at the destination need to
                // be touched here since population isn't tried later for such
                // pages.
                needs_touch = true;
            }
            else if (!test_bit(i, state->allocation_failed_mask)) {
                __set_bit(i, state->populate_pages_mask);
            }
        }

        // Touch if requested and needed.
        if (uvm_migrate_args->touch && needs_touch) {
            struct page *dst_page;

            UVM_ASSERT(args->dst[i] & MIGRATE_PFN_VALID);

            dst_page = migrate_pfn_to_page(args->dst[i]);
            UVM_ASSERT(dst_page);
            uvm_touch_page(dst_page);
        }
    }

    // Remove the IOMMU mappings created during the copy
    if (state->dma.num_pages > 0) {

        for_each_set_bit(i, state->dma.page_mask, state->num_pages)
            uvm_gpu_unmap_cpu_page(state->dma.addrs_gpus[i]->parent, state->dma.addrs[i]);
    }

    UVM_ASSERT(!bitmap_intersects(state->populate_pages_mask, state->allocation_failed_mask, state->num_pages));
}

void uvm_migrate_vma_finalize_and_map_helper(struct vm_area_struct *vma,
                                             const unsigned long *src,
                                             const unsigned long *dst,
                                             unsigned long start,
                                             unsigned long end,
                                             void *private)
{
    struct migrate_vma args =
    {
        .vma = vma,
        .dst = (unsigned long *) dst,
        .src = (unsigned long *) src,
        .start = start,
        .end = end,
    };

    uvm_migrate_vma_finalize_and_map(&args, (migrate_vma_state_t *) private);
}

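// Invoke the kernel's migrate_vma API, hiding the differences between the
// legacy migrate_vma() helper with callbacks and the newer
// migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() interface.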
static NV_STATUS nv_migrate_vma(struct migrate_vma *args, migrate_vma_state_t *state)
{
    int ret;

#if defined(CONFIG_MIGRATE_VMA_HELPER)
    static const struct migrate_vma_ops uvm_migrate_vma_ops =
    {
        .alloc_and_copy = uvm_migrate_vma_alloc_and_copy_helper,
        .finalize_and_map = uvm_migrate_vma_finalize_and_map_helper,
    };

    ret = migrate_vma(&uvm_migrate_vma_ops, args->vma, args->start, args->end, args->src, args->dst, state);
    if (ret < 0)
        return errno_to_nv_status(ret);
#else // CONFIG_MIGRATE_VMA_HELPER

#if defined(NV_MIGRATE_VMA_FLAGS_PRESENT)
    args->flags = MIGRATE_VMA_SELECT_SYSTEM;
#endif // NV_MIGRATE_VMA_FLAGS_PRESENT

    ret = migrate_vma_setup(args);
    if (ret < 0)
        return errno_to_nv_status(ret);

    uvm_migrate_vma_alloc_and_copy(args, state);
    if (state->status == NV_OK) {
        migrate_vma_pages(args);
        uvm_migrate_vma_finalize_and_map(args, state);
    }

    migrate_vma_finalize(args);
#endif // CONFIG_MIGRATE_VMA_HELPER

    return state->status;
}

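// Populate with get_user_pages each contiguous subregion of pages set in mask.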
static NV_STATUS migrate_pageable_vma_populate_mask(struct vm_area_struct *vma,
                                                    unsigned long start,
                                                    unsigned long outer,
                                                    const unsigned long *mask,
                                                    migrate_vma_state_t *state)
{
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;

    while (subregion_first < num_pages) {
        NV_STATUS status;
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        status = uvm_populate_pageable_vma(vma,
                                           start + subregion_first * PAGE_SIZE,
                                           (subregion_outer - subregion_first) * PAGE_SIZE,
                                           0,
                                           uvm_migrate_args->touch,
                                           uvm_migrate_args->populate_permissions);
        if (status != NV_OK)
            return status;

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

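// Retry migration with migrate_vma for each contiguous subregion of pages set
// in mask.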
static NV_STATUS migrate_pageable_vma_migrate_mask(struct vm_area_struct *vma,
                                                   unsigned long start,
                                                   unsigned long outer,
                                                   const unsigned long *mask,
                                                   migrate_vma_state_t *state)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    unsigned long subregion_first = find_first_bit(mask, num_pages);
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
    };

    UVM_ASSERT(!uvm_migrate_args->skip_mapped);

    while (subregion_first < num_pages) {
        unsigned long subregion_outer = find_next_zero_bit(mask, num_pages, subregion_first + 1);

        args.start = start + subregion_first * PAGE_SIZE;
        args.end = start + subregion_outer * PAGE_SIZE;

        status = nv_migrate_vma(&args, state);
        if (status != NV_OK)
            return status;

        // We ignore allocation failure here as we are just retrying migration,
        // but pages must have already been populated by the caller.

        subregion_first = find_next_bit(mask, num_pages, subregion_outer + 1);
    }

    return NV_OK;
}

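// Migrate a chunk of at most UVM_MIGRATE_VMA_MAX_SIZE bytes within a single
// anonymous VMA. Pages that migrate_vma could not move are populated with
// get_user_pages and, when possible, migrated again; on CPU allocation
// failures the caller may be asked to retry on a different node via
// NV_ERR_MORE_PROCESSING_REQUIRED and next_addr.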
static NV_STATUS migrate_pageable_vma_region(struct vm_area_struct *vma,
                                             unsigned long start,
                                             unsigned long outer,
                                             migrate_vma_state_t *state,
                                             unsigned long *next_addr)
{
    NV_STATUS status;
    const unsigned long num_pages = (outer - start) / PAGE_SIZE;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    struct migrate_vma args =
    {
        .vma = vma,
        .src = state->src_pfn_array,
        .dst = state->dst_pfn_array,
        .start = start,
        .end = outer,
    };

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(start < outer);
    UVM_ASSERT(start >= vma->vm_start);
    UVM_ASSERT(outer <= vma->vm_end);
    UVM_ASSERT(outer - start <= UVM_MIGRATE_VMA_MAX_SIZE);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&uvm_migrate_args->va_space->lock);

    status = nv_migrate_vma(&args, state);
    if (status != NV_OK)
        return status;

    // Save the returned page masks because they can be overwritten by
    // migrate_pageable_vma_migrate_mask().
    bitmap_copy(state->scratch1_mask, state->populate_pages_mask, num_pages);
    bitmap_copy(state->scratch2_mask, state->allocation_failed_mask, num_pages);

    if (!bitmap_empty(state->scratch1_mask, state->num_pages)) {
        // Populate pages using get_user_pages
        status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch1_mask, state);
        if (status != NV_OK)
            return status;

        if (!uvm_migrate_args->skip_mapped) {
            status = migrate_pageable_vma_migrate_mask(vma, start, outer, state->scratch1_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    // There is no need to copy the masks again after the migration is retried.
    // We ignore the allocation_failed, populate_pages and dst_resident_pages
    // masks set by the retried migration.

    if (!bitmap_empty(state->scratch2_mask, state->num_pages)) {
        // If the destination is the CPU, signal user-space to retry with a
        // different node. Otherwise, just try to populate anywhere in the
        // system.
        if (UVM_ID_IS_CPU(uvm_migrate_args->dst_id) && !uvm_migrate_args->populate_on_cpu_alloc_failures) {
            *next_addr = start + find_first_bit(state->scratch2_mask, num_pages) * PAGE_SIZE;
            return NV_ERR_MORE_PROCESSING_REQUIRED;
        }
        else {
            status = migrate_pageable_vma_populate_mask(vma, start, outer, state->scratch2_mask, state);
            if (status != NV_OK)
                return status;
        }
    }

    return NV_OK;
}

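// Migrate the intersection of [start, outer) with the given VMA, splitting the
// work into chunks of at most UVM_MIGRATE_VMA_MAX_SIZE bytes. Returns
// NV_WARN_NOTHING_TO_DO if migrate_vma cannot be used on this VMA.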
static NV_STATUS migrate_pageable_vma(struct vm_area_struct *vma,
                                      unsigned long start,
                                      unsigned long outer,
                                      migrate_vma_state_t *state,
                                      unsigned long *next_addr)
{
    NV_STATUS status = NV_OK;
    struct mm_struct *mm = vma->vm_mm;
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(outer));
    UVM_ASSERT(vma->vm_end > start);
    UVM_ASSERT(vma->vm_start < outer);
    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    // Adjust to input range boundaries
    start = max(start, vma->vm_start);
    outer = min(outer, vma->vm_end);

    // TODO: Bug 2419180: support file-backed pages in migrate_vma, when
    //       support for it is added to the Linux kernel
    if (!vma_is_anonymous(vma))
        return NV_WARN_NOTHING_TO_DO;

    if (uvm_processor_mask_empty(&va_space->registered_gpus))
        return NV_WARN_NOTHING_TO_DO;

    while (start < outer) {
        const size_t region_size = min(outer - start, UVM_MIGRATE_VMA_MAX_SIZE);

        status = migrate_pageable_vma_region(vma, start, start + region_size, state, next_addr);
        if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(*next_addr >= start);
            UVM_ASSERT(*next_addr < outer);
        }

        if (status != NV_OK)
            break;

        start += region_size;
    }

    return status;
}

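// Walk all VMAs covering [start, start + length) and migrate them one at a
// time, falling back to population (or reporting the address to retry) when
// migrate_vma cannot be used.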
static NV_STATUS migrate_pageable(migrate_vma_state_t *state)
{
    uvm_migrate_args_t *uvm_migrate_args = state->uvm_migrate_args;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    const unsigned long length = uvm_migrate_args->length;
    NvU64 *user_space_start = uvm_migrate_args->user_space_start;
    NvU64 *user_space_length = uvm_migrate_args->user_space_length;
    struct mm_struct *mm = uvm_migrate_args->mm;
    unsigned long start = uvm_migrate_args->start;
    unsigned long outer = start + length;
    unsigned long prev_outer = outer;
    struct vm_area_struct *vma;

    UVM_ASSERT(PAGE_ALIGNED(start));
    UVM_ASSERT(PAGE_ALIGNED(length));
    uvm_assert_mmap_lock_locked(mm);

    vma = find_vma_intersection(mm, start, outer);
    if (!vma || (start < vma->vm_start))
        return NV_ERR_INVALID_ADDRESS;

    // VMAs are validated and migrated one at a time, since migrate_vma works
    // on one vma at a time
    for (; vma->vm_start <= prev_outer; vma = find_vma_intersection(mm, prev_outer, outer)) {
        unsigned long next_addr = 0;
        NV_STATUS status;

        // Callers have already validated the range so the vma should be valid.
        UVM_ASSERT(vma);

        status = migrate_pageable_vma(vma, start, outer, state, &next_addr);
        if (status == NV_WARN_NOTHING_TO_DO) {
            NV_STATUS populate_status = NV_OK;
            bool touch = uvm_migrate_args->touch;
            uvm_populate_permissions_t populate_permissions = uvm_migrate_args->populate_permissions;

            UVM_ASSERT(!vma_is_anonymous(vma) || uvm_processor_mask_empty(&va_space->registered_gpus));

            // We can't use migrate_vma to move the pages as desired. Normally
            // this fallback path is supposed to populate the memory then inform
            // user mode that it should call move_pages, but that move_pages
            // call won't work as expected if the caller is in the wrong
            // process. Make that failure explicit so the caller is aware that
            // move_pages won't behave as expected.
            //
            // If the caller is a kernel thread, such as the GPU BH, continue
            // with population since there's no move_pages fallback.
            if (current->mm != mm && !(current->flags & PF_KTHREAD))
                return NV_ERR_NOT_SUPPORTED;

            // Populate pages with uvm_populate_pageable
            populate_status = uvm_populate_pageable_vma(vma, start, length, 0, touch, populate_permissions);
            if (populate_status == NV_OK) {
                *user_space_start = max(vma->vm_start, start);
                *user_space_length = min(vma->vm_end, outer) - *user_space_start;
            }
            else {
                status = populate_status;
            }
        }
        else if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
            UVM_ASSERT(next_addr >= start);
            UVM_ASSERT(next_addr < outer);
            UVM_ASSERT(UVM_ID_IS_CPU(uvm_migrate_args->dst_id));

            *user_space_start = next_addr;
        }

        if (status != NV_OK)
            return status;

        if (vma->vm_end >= outer)
            return NV_OK;

        prev_outer = vma->vm_end;
    }

    // Input range not fully covered by VMAs.
    return NV_ERR_INVALID_ADDRESS;
}

NV_STATUS uvm_migrate_pageable(uvm_migrate_args_t *uvm_migrate_args)
{
    migrate_vma_state_t *state = NULL;
    NV_STATUS status;
    uvm_va_space_t *va_space = uvm_migrate_args->va_space;
    uvm_processor_id_t dst_id = uvm_migrate_args->dst_id;
    int dst_node_id = uvm_migrate_args->dst_node_id;

    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->start));
    UVM_ASSERT(PAGE_ALIGNED(uvm_migrate_args->length));
    uvm_assert_mmap_lock_locked(uvm_migrate_args->mm);

    if (UVM_ID_IS_CPU(dst_id)) {
        // We only check that dst_node_id is a valid node in the system and
        // that it doesn't correspond to a GPU node. This is fine because
        // alloc_pages_node will clamp the allocation to
        // cpuset_current_mems_allowed when uvm_migrate_pageable is called from
        // process context (uvm_migrate) with a CPU dst_id. The UVM bottom half
        // calls uvm_migrate_pageable with a CPU dst_id only when the VMA
        // memory policy is set to dst_node_id and dst_node_id is not
        // NUMA_NO_NODE.
        if (!nv_numa_node_has_memory(dst_node_id) ||
            uvm_va_space_find_gpu_with_memory_node_id(va_space, dst_node_id) != NULL)
            return NV_ERR_INVALID_ARGUMENT;
    }
    else {
        // The incoming dst_node_id is only valid if dst_id is the CPU. If
        // dst_id is a GPU, use that GPU's NUMA node id as dst_node_id instead.
        uvm_migrate_args->dst_node_id = uvm_gpu_numa_node(uvm_va_space_get_gpu(va_space, dst_id));
    }

    state = kmem_cache_alloc(g_uvm_migrate_vma_state_cache, NV_UVM_GFP_FLAGS);
    if (!state)
        return NV_ERR_NO_MEMORY;

    state->uvm_migrate_args = uvm_migrate_args;
    status = migrate_pageable(state);

    kmem_cache_free(g_uvm_migrate_vma_state_cache, state);

    return status;
}

NV_STATUS uvm_migrate_pageable_init(void)
{
    g_uvm_migrate_vma_state_cache = NV_KMEM_CACHE_CREATE("migrate_vma_state_t", migrate_vma_state_t);
    if (!g_uvm_migrate_vma_state_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_migrate_pageable_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_migrate_vma_state_cache);
}
#endif // UVM_MIGRATE_VMA_SUPPORTED