1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_common.h"
25 #include "uvm_linux.h"
26 #include "uvm_types.h"
27 #include "uvm_api.h"
28 #include "uvm_global.h"
29 #include "uvm_hal.h"
30 #include "uvm_va_range.h"
31 #include "uvm_va_block.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_map_external.h"
34 #include "uvm_perf_thrashing.h"
35 #include "nv_uvm_interface.h"
36 
37 static struct kmem_cache *g_uvm_va_range_cache __read_mostly;
38 static struct kmem_cache *g_uvm_vma_wrapper_cache __read_mostly;
39 
40 NV_STATUS uvm_va_range_init(void)
41 {
42     g_uvm_va_range_cache = NV_KMEM_CACHE_CREATE("uvm_va_range_t", uvm_va_range_t);
43     if (!g_uvm_va_range_cache)
44         return NV_ERR_NO_MEMORY;
45 
46     g_uvm_vma_wrapper_cache = NV_KMEM_CACHE_CREATE("uvm_vma_wrapper_t", uvm_vma_wrapper_t);
47     if (!g_uvm_vma_wrapper_cache)
48         return NV_ERR_NO_MEMORY;
49 
50     return uvm_va_block_init();
51 }
52 
53 void uvm_va_range_exit(void)
54 {
55     uvm_va_block_exit();
56     kmem_cache_destroy_safe(&g_uvm_va_range_cache);
57     kmem_cache_destroy_safe(&g_uvm_vma_wrapper_cache);
58 }
59 
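// Return the inclusive start address of the block at the given index within
// the range. The first block is clamped to the (possibly unaligned) start of
// the range.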
60 static NvU64 block_calc_start(uvm_va_range_t *va_range, size_t index)
61 {
62     NvU64 range_start = UVM_VA_BLOCK_ALIGN_DOWN(va_range->node.start);
63     NvU64 block_start = range_start + index * UVM_VA_BLOCK_SIZE;
64     NvU64 start = max(va_range->node.start, block_start);
65     UVM_ASSERT(start < va_range->node.end);
66     return start;
67 }
68 
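// Return the inclusive end address of the block at the given index within the
// range. The last block is clamped to the (possibly unaligned) end of the
// range.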
69 static NvU64 block_calc_end(uvm_va_range_t *va_range, size_t index)
70 {
71     NvU64 start = block_calc_start(va_range, index);
72     NvU64 block_end = UVM_VA_BLOCK_ALIGN_UP(start + 1) - 1; // Inclusive end
73     NvU64 end = min(va_range->node.end, block_end);
74     UVM_ASSERT(end > va_range->node.start);
75     return end;
76 }
77 
78 // Called before the range's bounds have been adjusted. This may not actually
79 // shrink the blocks array. For example, if the shrink attempt fails then
80 // va_range's old array is left intact. This may waste memory, but it means this
81 // function cannot fail.
82 static void blocks_array_shrink(uvm_va_range_t *va_range, size_t new_num_blocks)
83 {
84     size_t new_size = new_num_blocks * sizeof(va_range->blocks[0]);
85     atomic_long_t *new_blocks;
86 
87     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
88     UVM_ASSERT(va_range->blocks);
89     UVM_ASSERT(uvm_kvsize(va_range->blocks) >= uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
90     UVM_ASSERT(new_num_blocks);
91     UVM_ASSERT(new_num_blocks <= uvm_va_range_num_blocks(va_range));
92 
93     // TODO: Bug 1766579: This could be optimized by only shrinking the array
94     //       when the new size is half of the old size or some similar
95     //       threshold. Need to profile this on real apps to see if that's worth
96     //       doing.
97 
98     new_blocks = uvm_kvrealloc(va_range->blocks, new_size);
99     if (!new_blocks) {
100         // If we failed to allocate a smaller array, just leave the old one as-is
101         UVM_DBG_PRINT("Failed to shrink range [0x%llx, 0x%llx] from %zu blocks to %zu blocks\n",
102                       va_range->node.start,
103                       va_range->node.end,
104                       uvm_kvsize(va_range->blocks) / sizeof(va_range->blocks[0]),
105                       new_num_blocks);
106         return;
107     }
108 
109     va_range->blocks = new_blocks;
110 }
111 
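// Allocate and zero a va_range covering [start, end] (both inclusive) in the
// given VA space. The caller is responsible for setting the range type and for
// inserting the node into the VA range tree.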
112 static uvm_va_range_t *uvm_va_range_alloc(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
113 {
114     uvm_va_range_t *va_range = nv_kmem_cache_zalloc(g_uvm_va_range_cache, NV_UVM_GFP_FLAGS);
115     if (!va_range)
116         return NULL;
117 
118     uvm_assert_rwsem_locked_write(&va_space->lock);
119 
120     va_range->va_space = va_space;
121     va_range->node.start = start;
122     va_range->node.end = end;
123 
124     // The range is inserted into the VA space tree only at the end of creation,
125     // so clear the node so the destroy path knows whether to remove it.
126     RB_CLEAR_NODE(&va_range->node.rb_node);
127 
128     return va_range;
129 }
130 
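// Allocate a va_range of the given type covering [start, end], after
// reclaiming any HMM va_blocks overlapping that region.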
131 static NV_STATUS uvm_va_range_alloc_reclaim(uvm_va_space_t *va_space,
132                                             struct mm_struct *mm,
133                                             uvm_va_range_type_t type,
134                                             NvU64 start,
135                                             NvU64 end,
136                                             uvm_va_range_t **out_va_range)
137 {
138     uvm_va_range_t *va_range;
139     NV_STATUS status;
140 
141     // Check for no overlap with HMM blocks.
142     status = uvm_hmm_va_block_reclaim(va_space, mm, start, end);
143     if (status != NV_OK)
144         return status;
145 
146     va_range = uvm_va_range_alloc(va_space, start, end);
147     if (!va_range)
148         return NV_ERR_NO_MEMORY;
149 
150     va_range->type = type;
151 
152     *out_va_range = va_range;
153     return NV_OK;
154 }
155 
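// Allocate a managed va_range covering [start, end] with the default policy
// and an empty (unpopulated) blocks array. Returns NULL on allocation failure.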
156 static uvm_va_range_t *uvm_va_range_alloc_managed(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
157 {
158     uvm_va_range_t *va_range = NULL;
159 
160     va_range = uvm_va_range_alloc(va_space, start, end);
161     if (!va_range)
162         goto error;
163 
164     va_range->type = UVM_VA_RANGE_TYPE_MANAGED;
165     va_range->managed.policy = uvm_va_policy_default;
166 
167     va_range->blocks = uvm_kvmalloc_zero(uvm_va_range_num_blocks(va_range) * sizeof(va_range->blocks[0]));
168     if (!va_range->blocks) {
169         UVM_DBG_PRINT("Failed to allocate %zu blocks\n", uvm_va_range_num_blocks(va_range));
170         goto error;
171     }
172 
173     return va_range;
174 
175 error:
176     uvm_va_range_destroy(va_range, NULL);
177     return NULL;
178 }
179 
180 NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
181                                    struct mm_struct *mm,
182                                    uvm_vma_wrapper_t *vma_wrapper,
183                                    uvm_va_range_t **out_va_range)
184 {
185     NV_STATUS status;
186     struct vm_area_struct *vma = vma_wrapper->vma;
187     uvm_va_range_t *va_range = NULL;
188 
189     // Check for no overlap with HMM blocks.
190     status = uvm_hmm_va_block_reclaim(va_space, mm, vma->vm_start, vma->vm_end - 1);
191     if (status != NV_OK)
192         return status;
193 
194     // vma->vm_end is exclusive but va_range end is inclusive
195     va_range = uvm_va_range_alloc_managed(va_space, vma->vm_start, vma->vm_end - 1);
196     if (!va_range) {
197         status = NV_ERR_NO_MEMORY;
198         goto error;
199     }
200 
201     va_range->managed.vma_wrapper = vma_wrapper;
202 
203     status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
204     if (status != NV_OK)
205         goto error;
206 
207     if (out_va_range)
208         *out_va_range = va_range;
209 
210     return NV_OK;
211 
212 error:
213     uvm_va_range_destroy(va_range, NULL);
214     return status;
215 }
216 
217 NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space,
218                                        struct mm_struct *mm,
219                                        NvU64 start,
220                                        NvU64 length,
221                                        uvm_va_range_t **out_va_range)
222 {
223     NV_STATUS status;
224     uvm_va_range_t *va_range = NULL;
225     NvU32 i;
226 
227     status = uvm_va_range_alloc_reclaim(va_space,
228                                         mm,
229                                         UVM_VA_RANGE_TYPE_EXTERNAL,
230                                         start,
231                                         start + length - 1,
232                                         &va_range);
233     if (status != NV_OK)
234         return status;
235 
236     for (i = 0; i < ARRAY_SIZE(va_range->external.gpu_ranges); i++) {
237         uvm_mutex_init(&va_range->external.gpu_ranges[i].lock, UVM_LOCK_ORDER_EXT_RANGE_TREE);
238         uvm_range_tree_init(&va_range->external.gpu_ranges[i].tree);
239     }
240 
241     status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
242     if (status != NV_OK)
243         goto error;
244 
245     if (out_va_range)
246         *out_va_range = va_range;
247 
248     return NV_OK;
249 
250 error:
251     uvm_va_range_destroy(va_range, NULL);
252     return status;
253 }
254 
255 NV_STATUS uvm_va_range_create_channel(uvm_va_space_t *va_space,
256                                       struct mm_struct *mm,
257                                       NvU64 start,
258                                       NvU64 end,
259                                       uvm_va_range_t **out_va_range)
260 {
261     NV_STATUS status;
262     uvm_va_range_t *va_range = NULL;
263 
264     status = uvm_va_range_alloc_reclaim(va_space,
265                                         mm,
266                                         UVM_VA_RANGE_TYPE_CHANNEL,
267                                         start,
268                                         end,
269                                         &va_range);
270     if (status != NV_OK)
271         return status;
272 
273     INIT_LIST_HEAD(&va_range->channel.list_node);
274 
275     status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
276     if (status != NV_OK)
277         goto error;
278 
279     if (out_va_range)
280         *out_va_range = va_range;
281 
282     return NV_OK;
283 
284 error:
285     uvm_va_range_destroy(va_range, NULL);
286     return status;
287 }
288 
289 NV_STATUS uvm_va_range_create_sked_reflected(uvm_va_space_t *va_space,
290                                              struct mm_struct *mm,
291                                              NvU64 start,
292                                              NvU64 length,
293                                              uvm_va_range_t **out_va_range)
294 {
295     NV_STATUS status;
296     uvm_va_range_t *va_range = NULL;
297 
298     status = uvm_va_range_alloc_reclaim(va_space,
299                                         mm,
300                                         UVM_VA_RANGE_TYPE_SKED_REFLECTED,
301                                         start,
302                                         start + length - 1,
303                                         &va_range);
304     if (status != NV_OK)
305         return status;
306 
307     status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
308     if (status != NV_OK)
309         goto error;
310 
311     if (out_va_range)
312         *out_va_range = va_range;
313 
314     return NV_OK;
315 
316 error:
317     uvm_va_range_destroy(va_range, NULL);
318     return status;
319 }
320 
321 NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
322                                              struct mm_struct *mm,
323                                              NvU64 start,
324                                              NvU64 length,
325                                              const UvmGpuMappingAttributes *per_gpu_attrs,
326                                              NvU32 per_gpu_attrs_count,
327                                              uvm_va_range_t **out_va_range)
328 {
329     static const uvm_mem_gpu_mapping_attrs_t default_attrs = {
330             .protection = UVM_PROT_READ_WRITE_ATOMIC,
331             .is_cacheable = false
332     };
333 
334     NV_STATUS status;
335     uvm_va_range_t *va_range = NULL;
336     uvm_mem_alloc_params_t mem_alloc_params = { 0 };
337     NvU32 i;
338     uvm_gpu_id_t gpu_id;
339 
340     status = uvm_va_range_alloc_reclaim(va_space,
341                                         mm,
342                                         UVM_VA_RANGE_TYPE_SEMAPHORE_POOL,
343                                         start,
344                                         start + length - 1,
345                                         &va_range);
346     if (status != NV_OK)
347         return status;
348 
349     uvm_tracker_init(&va_range->semaphore_pool.tracker);
350     uvm_mutex_init(&va_range->semaphore_pool.tracker_lock, UVM_LOCK_ORDER_SEMA_POOL_TRACKER);
351 
352     status = uvm_range_tree_add(&va_space->va_range_tree, &va_range->node);
353     if (status != NV_OK)
354         goto error;
355 
356     // The semaphore pool memory is located in sysmem, and must be zeroed upon
357     // allocation because it may be mapped on the user VA space.
358     mem_alloc_params.page_size = UVM_PAGE_SIZE_DEFAULT;
359     mem_alloc_params.size = length;
360     mem_alloc_params.zero = true;
361     mem_alloc_params.mm = mm;
362 
363     va_range->semaphore_pool.default_gpu_attrs = default_attrs;
364     va_range->semaphore_pool.owner = NULL;
365 
366     for_each_gpu_id(gpu_id)
367         va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu_id)] = default_attrs;
368 
369     for (i = 0; i < per_gpu_attrs_count; i++) {
370         uvm_gpu_t *gpu;
371         uvm_mem_gpu_mapping_attrs_t attrs = default_attrs;
372 
373         status = uvm_mem_translate_gpu_attributes(&per_gpu_attrs[i], va_space, &gpu, &attrs);
374         if (status != NV_OK)
375             goto error;
376 
377         if (i == 0 && g_uvm_global.conf_computing_enabled)
378             mem_alloc_params.dma_owner = gpu;
379 
380         if (attrs.is_cacheable) {
381             // At most 1 GPU can have this memory cached, in which case it is
382             // the 'owner' GPU.
383             if (va_range->semaphore_pool.owner != NULL) {
384                 UVM_DBG_PRINT("Caching of semaphore pool requested on >1 GPU.\n");
385                 status = NV_ERR_INVALID_ARGUMENT;
386                 goto error;
387             }
388 
389             va_range->semaphore_pool.owner = gpu;
390         }
391 
392         va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)] = attrs;
393     }
394 
395     status = uvm_mem_alloc(&mem_alloc_params, &va_range->semaphore_pool.mem);
396     if (status != NV_OK)
397         goto error;
398 
399     status = uvm_mem_map_cpu_kernel(va_range->semaphore_pool.mem);
400     if (status != NV_OK)
401         goto error;
402 
403     if (out_va_range)
404         *out_va_range = va_range;
405 
406     return NV_OK;
407 
408 error:
409     uvm_va_range_destroy(va_range, NULL);
410     return status;
411 }
412 
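// Tear down a managed range: kill every populated va_block, free the blocks
// array, notify perf of the range destruction and release the range group
// assignment for the region.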
413 static void uvm_va_range_destroy_managed(uvm_va_range_t *va_range)
414 {
415     uvm_va_block_t *block;
416     uvm_va_block_t *block_tmp;
417     uvm_perf_event_data_t event_data;
418     NV_STATUS status;
419 
420     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
421 
422     if (va_range->blocks) {
423         // Unmap and drop our ref count on each block
424         for_each_va_block_in_va_range_safe(va_range, block, block_tmp)
425             uvm_va_block_kill(block);
426 
427         uvm_kvfree(va_range->blocks);
428     }
429 
430     event_data.range_destroy.range = va_range;
431     uvm_perf_event_notify(&va_range->va_space->perf_events, UVM_PERF_EVENT_RANGE_DESTROY, &event_data);
432 
433     status = uvm_range_group_assign_range(va_range->va_space, NULL, va_range->node.start, va_range->node.end);
434     UVM_ASSERT(status == NV_OK);
435 }
436 
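// Destroy all external mappings in the range on every GPU that has any, adding
// each destroyed mapping to deferred_free_list so the underlying resources can
// be freed later, outside the VA space lock.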
437 static void uvm_va_range_destroy_external(uvm_va_range_t *va_range, struct list_head *deferred_free_list)
438 {
439     uvm_gpu_t *gpu;
440 
441     if (uvm_processor_mask_empty(&va_range->external.mapped_gpus))
442         return;
443 
444     UVM_ASSERT(deferred_free_list);
445 
446     for_each_va_space_gpu_in_mask(gpu, va_range->va_space, &va_range->external.mapped_gpus) {
447         uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
448         uvm_ext_gpu_map_t *ext_map, *ext_map_next;
449 
450         uvm_mutex_lock(&range_tree->lock);
451         uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, gpu)
452             uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
453         uvm_mutex_unlock(&range_tree->lock);
454     }
455 
456     UVM_ASSERT(uvm_processor_mask_empty(&va_range->external.mapped_gpus));
457 }
458 
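// Tear down a channel range: clear and free its page table mappings, if any,
// and remove it from the GPU VA space's channel list. The RM descriptor itself
// is released by channel unregister.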
459 static void uvm_va_range_destroy_channel(uvm_va_range_t *va_range)
460 {
461     uvm_gpu_va_space_t *gpu_va_space = va_range->channel.gpu_va_space;
462     uvm_membar_t membar;
463 
464     UVM_ASSERT(va_range->channel.ref_count == 0);
465 
466     // Unmap the buffer
467     if (gpu_va_space && va_range->channel.pt_range_vec.ranges) {
468         membar = uvm_hal_downgrade_membar_type(gpu_va_space->gpu, va_range->channel.aperture == UVM_APERTURE_VID);
469         uvm_page_table_range_vec_clear_ptes(&va_range->channel.pt_range_vec, membar);
470         uvm_page_table_range_vec_deinit(&va_range->channel.pt_range_vec);
471     }
472 
473     list_del(&va_range->channel.list_node);
474 
475     // Channel unregister handles releasing this descriptor back to RM
476     va_range->channel.rm_descriptor = 0;
477 }
478 
479 static void uvm_va_range_destroy_sked_reflected(uvm_va_range_t *va_range)
480 {
481     uvm_gpu_va_space_t *gpu_va_space = va_range->sked_reflected.gpu_va_space;
482 
483     if (!gpu_va_space || !va_range->sked_reflected.pt_range_vec.ranges)
484         return;
485 
486     // The SKED reflected mapping has no physical backing and hence no physical
487     // accesses can be pending to it and no membar is needed.
488     uvm_page_table_range_vec_clear_ptes(&va_range->sked_reflected.pt_range_vec, UVM_MEMBAR_NONE);
489     uvm_page_table_range_vec_deinit(&va_range->sked_reflected.pt_range_vec);
490 
491     va_range->sked_reflected.gpu_va_space = NULL;
492 }
493 
494 static void uvm_va_range_destroy_semaphore_pool(uvm_va_range_t *va_range)
495 {
496     NV_STATUS status = uvm_tracker_wait_deinit(&va_range->semaphore_pool.tracker);
497     if (status != NV_OK) {
498         UVM_ASSERT_MSG(status == uvm_global_get_status(),
499                        "uvm_tracker_wait_deinit() returned %d (%s) in uvm_va_range_destroy_semaphore_pool()\n",
500                        status,
501                        nvstatusToString(status));
502     }
503     uvm_mem_free(va_range->semaphore_pool.mem);
504     va_range->semaphore_pool.mem = NULL;
505 }
506 
507 void uvm_va_range_destroy(uvm_va_range_t *va_range, struct list_head *deferred_free_list)
508 {
509     if (!va_range)
510         return;
511 
512     if (!RB_EMPTY_NODE(&va_range->node.rb_node))
513         uvm_range_tree_remove(&va_range->va_space->va_range_tree, &va_range->node);
514 
515     switch (va_range->type) {
516         case UVM_VA_RANGE_TYPE_INVALID:
517             // Skip partially-created ranges with unset types
518             break;
519         case UVM_VA_RANGE_TYPE_MANAGED:
520             uvm_va_range_destroy_managed(va_range);
521             break;
522         case UVM_VA_RANGE_TYPE_EXTERNAL:
523             uvm_va_range_destroy_external(va_range, deferred_free_list);
524             break;
525         case UVM_VA_RANGE_TYPE_CHANNEL:
526             uvm_va_range_destroy_channel(va_range);
527             break;
528         case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
529             uvm_va_range_destroy_sked_reflected(va_range);
530             break;
531         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
532             uvm_va_range_destroy_semaphore_pool(va_range);
533             break;
534         default:
535             UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
536                            va_range->node.start, va_range->node.end, va_range->type);
537     }
538 
539     kmem_cache_free(g_uvm_va_range_cache, va_range);
540 }
541 
542 void uvm_va_range_zombify(uvm_va_range_t *va_range)
543 {
544     if (!va_range)
545         return;
546 
547     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
548     UVM_ASSERT(va_range->managed.vma_wrapper);
549 
550     // Destroy will be done by uvm_destroy_vma_managed
551     va_range->managed.vma_wrapper = NULL;
552 }
553 
554 NV_STATUS uvm_api_clean_up_zombie_resources(UVM_CLEAN_UP_ZOMBIE_RESOURCES_PARAMS *params, struct file *filp)
555 {
556     uvm_va_space_t *va_space = uvm_va_space_get(filp);
557     uvm_va_range_t *va_range, *va_range_next;
558 
559     uvm_va_space_down_write(va_space);
560 
561     uvm_for_each_va_range_safe(va_range, va_range_next, va_space) {
562         if (uvm_va_range_is_managed_zombie(va_range))
563             uvm_va_range_destroy(va_range, NULL);
564     }
565 
566     uvm_va_space_up_write(va_space);
567 
568     return NV_OK;
569 }
570 
571 NV_STATUS uvm_api_validate_va_range(UVM_VALIDATE_VA_RANGE_PARAMS *params, struct file *filp)
572 {
573     NV_STATUS status = NV_ERR_INVALID_ADDRESS;
574     uvm_va_space_t *va_space = uvm_va_space_get(filp);
575     uvm_va_range_t *va_range;
576 
577     uvm_va_space_down_read(va_space);
578 
579     va_range = uvm_va_range_find(va_space, params->base);
580     if (va_range && va_range->node.start == params->base && va_range->node.end + 1 == params->base + params->length)
581         status = NV_OK;
582 
583     uvm_va_space_up_read(va_space);
584 
585     return status;
586 }
587 
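// Per-block work needed when a GPU VA space is added to a managed range:
// PDE1 pre-population for ATS, remote mappings for accessed-by/UVM-Lite GPUs,
// and unsetting read duplication if this GPU's presence makes it unsupportable.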
588 static NV_STATUS va_range_add_gpu_va_space_managed(uvm_va_range_t *va_range,
589                                                    uvm_gpu_va_space_t *gpu_va_space,
590                                                    struct mm_struct *mm)
591 {
592     uvm_va_space_t *va_space = va_range->va_space;
593     uvm_gpu_t *gpu = gpu_va_space->gpu;
594     NV_STATUS status = NV_OK;
595     const bool should_add_remote_mappings =
596         uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu->id) ||
597         uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id);
598 
599     // By this time, the gpu is already in the registration mask.
600     const bool should_disable_read_duplication =
601         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED &&
602         (uvm_va_space_can_read_duplicate(va_space, NULL) != uvm_va_space_can_read_duplicate(va_space, gpu));
603 
604     // Combine conditions to perform a single VA block traversal
605     if (gpu_va_space->ats.enabled || should_add_remote_mappings || should_disable_read_duplication) {
606         uvm_va_block_t *va_block;
607         uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);
608 
610         // TODO: Bug 2090378. Consolidate all per-VA block operations within
611         // uvm_va_block_add_gpu_va_space so we only need to take the VA block
612         // once.
613         for_each_va_block_in_va_range(va_range, va_block) {
614             if (gpu_va_space->ats.enabled) {
615                 // Notify that a new GPU VA space has been created. This is
616                 // currently only used for PDE1 pre-population on ATS systems.
617                 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL, uvm_va_block_add_gpu_va_space(va_block, gpu_va_space));
618                 if (status != NV_OK)
619                     break;
620             }
621 
622             if (should_add_remote_mappings) {
623                 // Now that we have a GPU VA space, map any VA ranges for which
624                 // this GPU is a UVM-Lite GPU or has accessed_by set.
625                 status = uvm_va_block_set_accessed_by(va_block, va_block_context, gpu->id);
626                 if (status != NV_OK)
627                     break;
628             }
629 
630             if (should_disable_read_duplication) {
631                 status = uvm_va_block_unset_read_duplication(va_block, va_block_context);
632                 if (status != NV_OK)
633                     break;
634             }
635         }
636     }
637 
638     return status;
639 }
640 
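// Map the semaphore pool allocation into the GPU's user VA space at the
// range's base address, using the attributes registered for that GPU.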
641 static NV_STATUS va_range_add_gpu_va_space_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
642 {
643     uvm_mem_gpu_mapping_attrs_t *attrs;
644 
645     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
646     UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(va_range->semaphore_pool.mem, gpu));
647 
648     attrs = &va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)];
649 
650     return uvm_mem_map_gpu_user(va_range->semaphore_pool.mem,
651                                 gpu,
652                                 va_range->va_space,
653                                 (void *)va_range->node.start,
654                                 attrs);
655 }
656 
657 NV_STATUS uvm_va_range_add_gpu_va_space(uvm_va_range_t *va_range,
658                                         uvm_gpu_va_space_t *gpu_va_space,
659                                         struct mm_struct *mm)
660 {
661     UVM_ASSERT(va_range->type < UVM_VA_RANGE_TYPE_MAX);
662 
663     if (va_range->inject_add_gpu_va_space_error) {
664         va_range->inject_add_gpu_va_space_error = false;
665         return NV_ERR_NO_MEMORY;
666     }
667 
668     switch (va_range->type) {
669         case UVM_VA_RANGE_TYPE_MANAGED:
670             return va_range_add_gpu_va_space_managed(va_range, gpu_va_space, mm);
671         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
672             return va_range_add_gpu_va_space_semaphore_pool(va_range, gpu_va_space->gpu);
673         default:
674             return NV_OK;
675     }
676 }
677 
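// Remove the GPU VA space from every va_block in the range and re-enable read
// duplication if removing this GPU makes it supportable again.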
678 static void va_range_remove_gpu_va_space_managed(uvm_va_range_t *va_range,
679                                                  uvm_gpu_va_space_t *gpu_va_space,
680                                                  struct mm_struct *mm)
681 {
682     uvm_va_block_t *va_block;
683     uvm_va_space_t *va_space = va_range->va_space;
684     bool should_enable_read_duplicate;
685     uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);
686 
687     should_enable_read_duplicate =
688         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED &&
689         uvm_va_space_can_read_duplicate(va_space, NULL) != uvm_va_space_can_read_duplicate(va_space, gpu_va_space->gpu);
690 
691     for_each_va_block_in_va_range(va_range, va_block) {
692         uvm_mutex_lock(&va_block->lock);
693         uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
694         uvm_mutex_unlock(&va_block->lock);
695 
696         if (should_enable_read_duplicate)
697             uvm_va_block_set_read_duplication(va_block, va_block_context);
698     }
699 }
700 
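// Destroy all of this GPU's external mappings in the range, deferring the
// actual frees to deferred_free_list.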
701 static void va_range_remove_gpu_va_space_external(uvm_va_range_t *va_range,
702                                                   uvm_gpu_t *gpu,
703                                                   struct list_head *deferred_free_list)
704 {
705     uvm_ext_gpu_range_tree_t *range_tree;
706     uvm_ext_gpu_map_t *ext_map, *ext_map_next;
707 
708     UVM_ASSERT(deferred_free_list);
709 
710     range_tree = uvm_ext_gpu_range_tree(va_range, gpu);
711     uvm_mutex_lock(&range_tree->lock);
712 
713     uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, gpu)
714         uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
715 
716     uvm_mutex_unlock(&range_tree->lock);
717 }
718 
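// Unmap the semaphore pool from the GPU's user VA space. With Confidential
// Computing enabled, the entire range is destroyed instead when this GPU is
// the allocation's DMA owner.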
719 static void va_range_remove_gpu_va_space_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
720 {
721     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
722 
723     if (g_uvm_global.conf_computing_enabled && (va_range->semaphore_pool.mem->dma_owner == gpu))
724         uvm_va_range_destroy(va_range, NULL);
725     else
726         uvm_mem_unmap_gpu_user(va_range->semaphore_pool.mem, gpu);
727 }
728 
729 void uvm_va_range_remove_gpu_va_space(uvm_va_range_t *va_range,
730                                       uvm_gpu_va_space_t *gpu_va_space,
731                                       struct mm_struct *mm,
732                                       struct list_head *deferred_free_list)
733 {
734     switch (va_range->type) {
735         case UVM_VA_RANGE_TYPE_MANAGED:
736             va_range_remove_gpu_va_space_managed(va_range, gpu_va_space, mm);
737             break;
738         case UVM_VA_RANGE_TYPE_EXTERNAL:
739             va_range_remove_gpu_va_space_external(va_range, gpu_va_space->gpu, deferred_free_list);
740             break;
741         case UVM_VA_RANGE_TYPE_CHANNEL:
742             // All channels under this GPU VA space should've been removed before
743             // removing the GPU VA space.
744             UVM_ASSERT(va_range->channel.gpu_va_space != gpu_va_space);
745             break;
746         case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
747             if (va_range->sked_reflected.gpu_va_space == gpu_va_space)
748                 uvm_va_range_destroy_sked_reflected(va_range);
749             break;
750         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
751             va_range_remove_gpu_va_space_semaphore_pool(va_range, gpu_va_space->gpu);
752             break;
753         default:
754             UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
755                            va_range->node.start, va_range->node.end, va_range->type);
756     }
757 }
758 
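// Enable peer access between gpu0 and gpu1 on every va_block in the range,
// then establish any accessed-by mappings those GPUs have on the range.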
759 static NV_STATUS uvm_va_range_enable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
760 {
761     NV_STATUS status;
762     uvm_va_block_t *va_block;
763     bool gpu0_accessed_by = uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu0->id);
764     bool gpu1_accessed_by = uvm_processor_mask_test(&uvm_va_range_get_policy(va_range)->accessed_by, gpu1->id);
765     uvm_va_space_t *va_space = va_range->va_space;
766     uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, NULL);
767 
769     for_each_va_block_in_va_range(va_range, va_block) {
770         // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic
771         //       into uvm_va_block_enable_peer.
772         uvm_mutex_lock(&va_block->lock);
773         status = uvm_va_block_enable_peer(va_block, gpu0, gpu1);
774         uvm_mutex_unlock(&va_block->lock);
775 
776         if (status != NV_OK)
777             return status;
778 
779         // For UVM-Lite at most one GPU needs to map the peer GPU if it's the
780         // preferred location, but it doesn't hurt to just try mapping both.
781         if (gpu0_accessed_by) {
782             status = uvm_va_block_set_accessed_by(va_block,
783                                                   va_block_context,
784                                                   gpu0->id);
785             if (status != NV_OK)
786                 return status;
787         }
788 
789         if (gpu1_accessed_by) {
790             status = uvm_va_block_set_accessed_by(va_block,
791                                                   va_block_context,
792                                                   gpu1->id);
793             if (status != NV_OK)
794                 return status;
795         }
796     }
797 
798     return NV_OK;
799 }
800 
801 NV_STATUS uvm_va_range_enable_peer(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
802 {
803     switch (va_range->type) {
804         case UVM_VA_RANGE_TYPE_MANAGED:
805             return uvm_va_range_enable_peer_managed(va_range, gpu0, gpu1);
806         case UVM_VA_RANGE_TYPE_EXTERNAL:
807             // UVM_VA_RANGE_TYPE_EXTERNAL doesn't create new mappings when enabling peer access
808             return NV_OK;
809         case UVM_VA_RANGE_TYPE_CHANNEL:
810             // UVM_VA_RANGE_TYPE_CHANNEL should never have peer mappings
811             return NV_OK;
812         case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
813             // UVM_VA_RANGE_TYPE_SKED_REFLECTED should never have peer mappings
814             return NV_OK;
815         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
816             // UVM_VA_RANGE_TYPE_SEMAPHORE_POOL should never have peer mappings
817             return NV_OK;
818         default:
819             UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
820                            va_range->node.start, va_range->node.end, va_range->type);
821             return NV_ERR_NOT_SUPPORTED;
822     }
823 }
824 
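// Destroy mapping_gpu's external mappings to memory owned by owning_gpu (peer
// vidmem or EGM sysmem), deferring the frees to deferred_free_list.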
825 static void uvm_va_range_disable_peer_external(uvm_va_range_t *va_range,
826                                                uvm_gpu_t *mapping_gpu,
827                                                uvm_gpu_t *owning_gpu,
828                                                struct list_head *deferred_free_list)
829 {
830     uvm_ext_gpu_range_tree_t *range_tree;
831     uvm_ext_gpu_map_t *ext_map, *ext_map_next;
832 
833     range_tree = uvm_ext_gpu_range_tree(va_range, mapping_gpu);
834     uvm_mutex_lock(&range_tree->lock);
835     uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, mapping_gpu) {
836         if (ext_map->owning_gpu == owning_gpu && (!ext_map->is_sysmem || ext_map->is_egm)) {
837             UVM_ASSERT(deferred_free_list);
838             uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
839         }
840     }
841     uvm_mutex_unlock(&range_tree->lock);
842 }
843 
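// Disable peer access between gpu0 and gpu1 on every va_block in the range.
// When both are UVM-Lite GPUs and one is the preferred location, the other is
// unmapped instead, and loses its accessed-by/UVM-Lite status if migration is
// currently disallowed.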
844 static void uvm_va_range_disable_peer_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
845 {
846     uvm_va_block_t *va_block;
847     uvm_gpu_t *uvm_lite_gpu_to_unmap = NULL;
848 
849     bool uvm_lite_mode = uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu0->id) &&
850                          uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu1->id);
851 
852     if (uvm_lite_mode) {
853         // In UVM-Lite mode, the UVM-Lite GPUs can only have mappings to the
854         // preferred location. If peer mappings are being disabled to the
855         // preferred location, then unmap the other GPU.
856         // Nothing to do otherwise.
857         if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu0->id))
858             uvm_lite_gpu_to_unmap = gpu1;
859         else if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu1->id))
860             uvm_lite_gpu_to_unmap = gpu0;
861         else
862             return;
863     }
864 
865     for_each_va_block_in_va_range(va_range, va_block) {
866         uvm_mutex_lock(&va_block->lock);
867         if (uvm_lite_mode)
868             uvm_va_block_unmap_preferred_location_uvm_lite(va_block, uvm_lite_gpu_to_unmap);
869         else
870             uvm_va_block_disable_peer(va_block, gpu0, gpu1);
871         uvm_mutex_unlock(&va_block->lock);
872     }
873 
874     if (uvm_lite_mode && !uvm_range_group_all_migratable(va_range->va_space, va_range->node.start, va_range->node.end)) {
875         UVM_ASSERT(uvm_lite_gpu_to_unmap);
876 
877         // Migration is prevented, but we had to unmap a UVM-Lite GPU. Update
878         // the accessed by and UVM-Lite GPUs masks as it cannot be considered a
879         // UVM-Lite GPU any more.
880         uvm_va_range_unset_accessed_by(va_range, uvm_lite_gpu_to_unmap->id, NULL);
881     }
882 }
883 
884 void uvm_va_range_disable_peer(uvm_va_range_t *va_range,
885                                uvm_gpu_t *gpu0,
886                                uvm_gpu_t *gpu1,
887                                struct list_head *deferred_free_list)
888 {
890     switch (va_range->type) {
891         case UVM_VA_RANGE_TYPE_MANAGED:
892             uvm_va_range_disable_peer_managed(va_range, gpu0, gpu1);
893             break;
894         case UVM_VA_RANGE_TYPE_EXTERNAL:
895             // If GPU 0 has a mapping to GPU 1, remove GPU 0's mapping
896             uvm_va_range_disable_peer_external(va_range, gpu0, gpu1, deferred_free_list);
897             // If GPU 1 has a mapping to GPU 0, remove GPU 1's mapping
898             uvm_va_range_disable_peer_external(va_range, gpu1, gpu0, deferred_free_list);
899             break;
900         case UVM_VA_RANGE_TYPE_CHANNEL:
901             // UVM_VA_RANGE_TYPE_CHANNEL should never have peer mappings
902             break;
903         case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
904             // UVM_VA_RANGE_TYPE_SKED_REFLECTED should never have peer mappings
905             break;
906         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
907             // UVM_VA_RANGE_TYPE_SEMAPHORE_POOL should never have peer mappings
908             break;
909         default:
910             UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
911                            va_range->node.start, va_range->node.end, va_range->type);
912     }
913 }
914 
915 static NV_STATUS va_range_register_gpu_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
916 {
917     // TODO: Bug 1812419: pass GPU mapping attributes to uvm_mem_map_gpu_kernel
918     // once that function accepts them.
919     return uvm_mem_map_gpu_kernel(va_range->semaphore_pool.mem, gpu);
920 }
921 
922 NV_STATUS uvm_va_range_register_gpu(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
923 {
924     UVM_ASSERT(va_range->type < UVM_VA_RANGE_TYPE_MAX);
925     uvm_assert_rwsem_locked_write(&va_range->va_space->lock);
926 
927     if (va_range->type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL)
928         return va_range_register_gpu_semaphore_pool(va_range, gpu);
929 
930     return NV_OK;
931 }
932 
933 static void va_range_unregister_gpu_managed(uvm_va_range_t *va_range, uvm_gpu_t *gpu, struct mm_struct *mm)
934 {
935     uvm_va_block_t *va_block;
936 
937     // Reset the preferred location and accessed-by settings of the range if needed
938     // Note: the return code of uvm_va_range_set_preferred_location is ignored since
939     // it can only fail when setting a preferred location, not when clearing one
940     if (uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, gpu->id))
941         (void)uvm_va_range_set_preferred_location(va_range, UVM_ID_INVALID, NUMA_NO_NODE, mm, NULL);
942 
943     uvm_va_range_unset_accessed_by(va_range, gpu->id, NULL);
944 
945     // Migrate and free any remaining resident allocations on this GPU
946     for_each_va_block_in_va_range(va_range, va_block)
947         uvm_va_block_unregister_gpu(va_block, gpu, mm);
948 }
949 
950 // The GPU being unregistered can't have any remaining mappings, since those
951 // were removed when the corresponding GPU VA space was removed. However, other
952 // GPUs could still have mappings to memory resident on this GPU, so we have to
953 // unmap those.
954 static void va_range_unregister_gpu_external(uvm_va_range_t *va_range,
955                                              uvm_gpu_t *gpu,
956                                              struct list_head *deferred_free_list)
957 {
958     uvm_ext_gpu_map_t *ext_map, *ext_map_next;
959     uvm_gpu_t *other_gpu;
960 
961     for_each_va_space_gpu_in_mask(other_gpu, va_range->va_space, &va_range->external.mapped_gpus) {
962         uvm_ext_gpu_range_tree_t *range_tree = uvm_ext_gpu_range_tree(va_range, other_gpu);
963         UVM_ASSERT(other_gpu != gpu);
964 
965         uvm_mutex_lock(&range_tree->lock);
966         uvm_ext_gpu_map_for_each_safe(ext_map, ext_map_next, va_range, other_gpu) {
967             if (ext_map->owning_gpu == gpu) {
968                 UVM_ASSERT(deferred_free_list);
969                 uvm_ext_gpu_map_destroy(va_range, ext_map, deferred_free_list);
970             }
971         }
972         uvm_mutex_unlock(&range_tree->lock);
973     }
974 }
975 
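// Drop the semaphore pool's state for the GPU being unregistered: wait for
// pending tracker work, unmap the allocation from the GPU, and reset the GPU's
// mapping attributes (and pool ownership, if applicable) to the defaults.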
976 static void va_range_unregister_gpu_semaphore_pool(uvm_va_range_t *va_range, uvm_gpu_t *gpu)
977 {
978     NV_STATUS status;
979 
980     // The pool should have been unmapped from this GPU's user VA space by GPU VA
981     // space unregister, which must have already happened.
982     UVM_ASSERT(!uvm_mem_mapped_on_gpu_user(va_range->semaphore_pool.mem, gpu));
983     UVM_ASSERT(uvm_mem_mapped_on_gpu_kernel(va_range->semaphore_pool.mem, gpu));
984 
985     uvm_mutex_lock(&va_range->semaphore_pool.tracker_lock);
986     status = uvm_tracker_wait(&va_range->semaphore_pool.tracker);
987     uvm_mutex_unlock(&va_range->semaphore_pool.tracker_lock);
988     if (status != NV_OK)
989         UVM_ASSERT(status == uvm_global_get_status());
990 
991     uvm_mem_unmap_gpu_phys(va_range->semaphore_pool.mem, gpu);
992 
993     va_range->semaphore_pool.gpu_attrs[uvm_id_gpu_index(gpu->id)] = va_range->semaphore_pool.default_gpu_attrs;
994     if (va_range->semaphore_pool.owner == gpu)
995         va_range->semaphore_pool.owner = NULL;
996 }
997 
998 void uvm_va_range_unregister_gpu(uvm_va_range_t *va_range,
999                                  uvm_gpu_t *gpu,
1000                                  struct mm_struct *mm,
1001                                  struct list_head *deferred_free_list)
1002 {
1003     switch (va_range->type) {
1004         case UVM_VA_RANGE_TYPE_MANAGED:
1005             va_range_unregister_gpu_managed(va_range, gpu, mm);
1006             break;
1007         case UVM_VA_RANGE_TYPE_EXTERNAL:
1008             va_range_unregister_gpu_external(va_range, gpu, deferred_free_list);
1009             break;
1010         case UVM_VA_RANGE_TYPE_CHANNEL:
1011             // All ranges should have been destroyed by GPU VA space unregister,
1012             // which should have already happened.
1013             UVM_ASSERT(va_range->channel.gpu_va_space->gpu != gpu);
1014             break;
1015         case UVM_VA_RANGE_TYPE_SKED_REFLECTED:
1016             // All ranges for this GPU should have been unmapped by GPU VA space
1017             // unregister (uvm_va_range_destroy_sked_reflected), which should
1018             // have already happened.
1019             if (va_range->sked_reflected.gpu_va_space != NULL)
1020                 UVM_ASSERT(va_range->sked_reflected.gpu_va_space->gpu != gpu);
1021             break;
1022         case UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
1023             va_range_unregister_gpu_semaphore_pool(va_range, gpu);
1024             break;
1025         default:
1026             UVM_ASSERT_MSG(0, "[0x%llx, 0x%llx] has type %d\n",
1027                            va_range->node.start, va_range->node.end, va_range->type);
1028     }
1029 }
1030 
1031 // Split existing's blocks into new. new's blocks array has already been
1032 // allocated. This is called before existing's range node is split, so it
1033 // overlaps new. new is always in the upper region of existing.
1034 //
1035 // The caller will do the range tree split.
1036 //
1037 // If this fails it leaves existing unchanged.
1038 static NV_STATUS uvm_va_range_split_blocks(uvm_va_range_t *existing, uvm_va_range_t *new)
1039 {
1040     uvm_va_block_t *old_block, *block = NULL;
1041     size_t existing_blocks, split_index, new_index = 0;
1042     NV_STATUS status;
1043 
1044     UVM_ASSERT(new->node.start >  existing->node.start);
1045     UVM_ASSERT(new->node.end   <= existing->node.end);
1046 
1047     split_index = uvm_va_range_block_index(existing, new->node.start);
1048 
1049     // Handle a block spanning the split point
1050     if (block_calc_start(existing, split_index) != new->node.start) {
1051         // If a populated block actually spans the split point, we have to split
1052         // the block. Otherwise just account for the extra entry in the arrays.
1053         old_block = uvm_va_range_block(existing, split_index);
1054         if (old_block) {
1055             UVM_ASSERT(old_block->start < new->node.start);
1056             status = uvm_va_block_split(old_block, new->node.start - 1, &block, new);
1057             if (status != NV_OK)
1058                 return status;
1059 
1060             // No memory barrier is needed since we're holding the va_space lock in
1061             // write mode, so no other thread can access the blocks array.
1062             atomic_long_set(&new->blocks[0], (long)block);
1063         }
1064 
1065         new_index = 1;
1066     }
1067 
1068     // uvm_va_block_split gets first crack at injecting an error. If it did so,
1069     // we wouldn't be here. However, not all va_range splits will call
1070     // uvm_va_block_split so we need an extra check here. We can't push this
1071     // injection later since all paths past this point assume success, so they
1072     // modify the state of 'existing' range.
1073     //
1074     // Even if there was no block split above, there is no guarantee that one
1075     // of our blocks doesn't have the 'inject_split_error' flag set. We clear
1076     // that here to prevent multiple errors caused by one
1077     // 'uvm_test_va_range_inject_split_error' call.
1078     if (existing->inject_split_error) {
1079         UVM_ASSERT(!block);
1080         existing->inject_split_error = false;
1081 
1082         for_each_va_block_in_va_range(existing, block) {
1083             uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1084             if (block_test)
1085                 block_test->inject_split_error = false;
1086         }
1087 
1088         return NV_ERR_NO_MEMORY;
1089     }
1090 
1091     existing_blocks = split_index + new_index;
1092 
1093     // Copy existing's blocks over to the new range, accounting for the explicit
1094     // assignment above in case we did a block split. There are two general
1095     // cases:
1096     //
1097     // No split:
1098     //                             split_index
1099     //                                  v
1100     //  existing (before) [----- A ----][----- B ----][----- C ----]
1101     //  existing (after)  [----- A ----]
1102     //  new                             [----- B ----][----- C ----]
1103     //
1104     // Split:
1105     //                                    split_index
1106     //                                         v
1107     //  existing (before) [----- A ----][----- B ----][----- C ----]
1108     //  existing (after)  [----- A ----][- B -]
1109     //  new                                    [- N -][----- C ----]
1110     //                                            ^new->blocks[0]
1111 
1112     // Note, if we split the last block of existing, this won't iterate at all.
1113     for (; new_index < uvm_va_range_num_blocks(new); new_index++) {
1114         block = uvm_va_range_block(existing, split_index + new_index);
1115         if (!block) {
1116             // new's array was cleared at allocation
1117             UVM_ASSERT(uvm_va_range_block(new, new_index) == NULL);
1118             continue;
1119         }
1120 
1121         // As soon as we make this assignment and drop the lock, the reverse
1122         // mapping code can start looking at new, so new must be ready to go.
1123         uvm_mutex_lock(&block->lock);
1124         UVM_ASSERT(block->va_range == existing);
1125         block->va_range = new;
1126         uvm_mutex_unlock(&block->lock);
1127 
1128         // No memory barrier is needed since we're holding the va_space lock in
1129         // write mode, so no other thread can access the blocks array.
1130         atomic_long_set(&new->blocks[new_index], (long)block);
1131         atomic_long_set(&existing->blocks[split_index + new_index], (long)NULL);
1132     }
1133 
1134     blocks_array_shrink(existing, existing_blocks);
1135 
1136     return NV_OK;
1137 }
1138 
1139 NV_STATUS uvm_va_range_split(uvm_va_range_t *existing_va_range,
1140                              NvU64 new_end,
1141                              uvm_va_range_t **new_va_range)
1142 {
1143     uvm_va_space_t *va_space = existing_va_range->va_space;
1144     uvm_va_range_t *new = NULL;
1145     uvm_perf_event_data_t event_data;
1146     NV_STATUS status;
1147 
1148     UVM_ASSERT(existing_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1149     UVM_ASSERT(new_end > existing_va_range->node.start);
1150     UVM_ASSERT(new_end < existing_va_range->node.end);
1151     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
1152     uvm_assert_rwsem_locked_write(&va_space->lock);
1153 
1154     new = uvm_va_range_alloc_managed(va_space, new_end + 1, existing_va_range->node.end);
1155     if (!new) {
1156         status = NV_ERR_NO_MEMORY;
1157         goto error;
1158     }
1159 
1160     // The new va_range is under the same vma. If this is a uvm_vm_open, the
1161     // caller takes care of updating existing's vma_wrapper for us.
1162     new->managed.vma_wrapper = existing_va_range->managed.vma_wrapper;
1163 
1164     // Copy over state before splitting blocks so any block lookups happening
1165     // concurrently on the eviction path will see the new range's data.
1166     uvm_va_range_get_policy(new)->read_duplication = uvm_va_range_get_policy(existing_va_range)->read_duplication;
1167     uvm_va_range_get_policy(new)->preferred_location = uvm_va_range_get_policy(existing_va_range)->preferred_location;
1168     uvm_va_range_get_policy(new)->preferred_nid = uvm_va_range_get_policy(existing_va_range)->preferred_nid;
1169     uvm_processor_mask_copy(&uvm_va_range_get_policy(new)->accessed_by,
1170                             &uvm_va_range_get_policy(existing_va_range)->accessed_by);
1171     uvm_processor_mask_copy(&new->uvm_lite_gpus, &existing_va_range->uvm_lite_gpus);
1172 
1173     status = uvm_va_range_split_blocks(existing_va_range, new);
1174     if (status != NV_OK)
1175         goto error;
1176 
1177     // Finally, update the VA range tree
1178     uvm_range_tree_split(&va_space->va_range_tree, &existing_va_range->node, &new->node);
1179 
1180     if (new->type == UVM_VA_RANGE_TYPE_MANAGED) {
1181         event_data.range_shrink.range = new;
1182         uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_RANGE_SHRINK, &event_data);
1183     }
1184 
1185     if (new_va_range)
1186         *new_va_range = new;
1187     return NV_OK;
1188 
1189 error:
1190     uvm_va_range_destroy(new, NULL);
1191     return status;
1193 }
1194 
1195 uvm_va_range_t *uvm_va_range_find(uvm_va_space_t *va_space, NvU64 addr)
1196 {
1197     uvm_assert_rwsem_locked(&va_space->lock);
1198     return uvm_va_range_container(uvm_range_tree_find(&va_space->va_range_tree, addr));
1199 }
1200 
1201 uvm_va_range_t *uvm_va_space_iter_first(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
1202 {
1203     uvm_range_tree_node_t *node = uvm_range_tree_iter_first(&va_space->va_range_tree, start, end);
1204     return uvm_va_range_container(node);
1205 }
1206 
1207 uvm_va_range_t *uvm_va_space_iter_next(uvm_va_range_t *va_range, NvU64 end)
1208 {
1209     uvm_range_tree_node_t *node;
1210 
1211     // Handling a NULL va_range here makes uvm_for_each_va_range_in_safe much
1212     // less messy
1213     if (!va_range)
1214         return NULL;
1215 
1216     node = uvm_range_tree_iter_next(&va_range->va_space->va_range_tree, &va_range->node, end);
1217     return uvm_va_range_container(node);
1218 }
1219 
1220 size_t uvm_va_range_num_blocks(uvm_va_range_t *va_range)
1221 {
1222     NvU64 start = UVM_VA_BLOCK_ALIGN_DOWN(va_range->node.start);
1223     NvU64 end   = UVM_VA_BLOCK_ALIGN_UP(va_range->node.end); // End is inclusive
1224     return (end - start) / UVM_VA_BLOCK_SIZE;
1225 }
1226 
1227 size_t uvm_va_range_block_index(uvm_va_range_t *va_range, NvU64 addr)
1228 {
1229     size_t addr_index, start_index, index;
1230 
1231     UVM_ASSERT(addr >= va_range->node.start);
1232     UVM_ASSERT(addr <= va_range->node.end);
1233     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1234 
1235     // Each block will cover as much space as possible within the aligned
1236     // UVM_VA_BLOCK_SIZE, up to the parent VA range boundaries. In other words,
1237     // the entire VA space can be broken into UVM_VA_BLOCK_SIZE chunks. Even if
1238     // there are multiple ranges (and thus multiple blocks) per actual
1239     // UVM_VA_BLOCK_SIZE chunk, none of those will have more than 1 block unless
1240     // they span a UVM_VA_BLOCK_SIZE alignment boundary.
1241     addr_index = (size_t)(addr / UVM_VA_BLOCK_SIZE);
1242     start_index = (size_t)(va_range->node.start / UVM_VA_BLOCK_SIZE);
1243 
1244     index = addr_index - start_index;
1245     UVM_ASSERT(index < uvm_va_range_num_blocks(va_range));
1246     return index;
1247 }
1248 
1249 NV_STATUS uvm_va_range_block_create(uvm_va_range_t *va_range, size_t index, uvm_va_block_t **out_block)
1250 {
1251     uvm_va_block_t *block, *old;
1252     NV_STATUS status;
1253 
1254     block = uvm_va_range_block(va_range, index);
1255     if (!block) {
1256         // No block has been created here yet, so allocate one and attempt to
1257         // insert it. Note that this runs the risk of an out-of-memory error
1258         // when multiple threads race and all concurrently allocate a block for
1259         // the same address. This should be extremely rare. There is also
1260         // precedent in the Linux kernel, which does the same thing for demand-
1261         // allocation of anonymous pages.
1262         status = uvm_va_block_create(va_range,
1263                                      block_calc_start(va_range, index),
1264                                      block_calc_end(va_range, index),
1265                                      &block);
1266         if (status != NV_OK)
1267             return status;
1268 
1269         // Try to insert it
1270         old = (uvm_va_block_t *)nv_atomic_long_cmpxchg(&va_range->blocks[index],
1271                                                       (long)NULL,
1272                                                       (long)block);
1273         if (old) {
1274             // Someone else beat us on the insert
1275             uvm_va_block_release(block);
1276             block = old;
1277         }
1278     }
1279 
1280     *out_block = block;
1281     return NV_OK;
1282 }
1283 
1284 uvm_va_block_t *uvm_va_range_block_next(uvm_va_range_t *va_range, uvm_va_block_t *va_block)
1285 {
1286     uvm_va_space_t *va_space = va_range->va_space;
1287     size_t i = 0;
1288 
1289     uvm_assert_rwsem_locked(&va_space->lock);
1290 
1291     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1292 
1293     if (va_block)
1294         i = uvm_va_range_block_index(va_range, va_block->start) + 1;
1295 
1296     for (; i < uvm_va_range_num_blocks(va_range); i++) {
1297         va_block = uvm_va_range_block(va_range, i);
1298         if (va_block) {
1299             UVM_ASSERT(va_block->va_range == va_range);
1300             UVM_ASSERT(uvm_va_range_block_index(va_range, va_block->start) == i);
1301             return va_block;
1302         }
1303     }
1304 
1305     return NULL;
1306 }
1307 
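// Unmap the given processors from every va_block in the range, optionally
// accumulating the unmap work into out_tracker.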
1308 static NV_STATUS range_unmap_mask(uvm_va_range_t *va_range,
1309                                   const uvm_processor_mask_t *mask,
1310                                   uvm_tracker_t *out_tracker)
1311 {
1312     uvm_va_space_t *va_space = va_range->va_space;
1313     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
1314     uvm_va_block_t *block;
1315 
1316     UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type 0x%x\n", va_range->type);
1317 
1318     if (uvm_processor_mask_empty(mask))
1319         return NV_OK;
1320 
1322     for_each_va_block_in_va_range(va_range, block) {
1323         NV_STATUS status;
1324         uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
1325 
1326         uvm_mutex_lock(&block->lock);
1327         status = uvm_va_block_unmap_mask(block, block_context, mask, region, NULL);
1328         if (out_tracker)
1329             uvm_tracker_add_tracker_safe(out_tracker, &block->tracker);
1330 
1331         uvm_mutex_unlock(&block->lock);
1332         if (status != NV_OK)
1333             return status;
1334     }
1335 
1336     return NV_OK;
1337 }
1338 
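// Convenience wrapper around range_unmap_mask() for a single processor.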
1339 static NV_STATUS range_unmap(uvm_va_range_t *va_range, uvm_processor_id_t processor, uvm_tracker_t *out_tracker)
1340 {
1341     uvm_processor_mask_t mask;
1342 
1343     UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type 0x%x\n", va_range->type);
1344 
1345     uvm_processor_mask_zero(&mask);
1346     uvm_processor_mask_set(&mask, processor);
1347 
1348     return range_unmap_mask(va_range, &mask, out_tracker);
1349 }
1350 
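// Map the range's UVM-Lite GPUs over all of its blocks with RWA permissions.
// Each block's pending work is added to out_tracker when one is provided.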
1351 static NV_STATUS range_map_uvm_lite_gpus(uvm_va_range_t *va_range, uvm_tracker_t *out_tracker)
1352 {
1353     NV_STATUS status = NV_OK;
1354     uvm_va_block_t *va_block;
1355     uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_range->va_space, NULL);
1356 
1357     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1358 
1359     if (uvm_processor_mask_empty(&va_range->uvm_lite_gpus))
1360         return NV_OK;
1361 
1363     for_each_va_block_in_va_range(va_range, va_block) {
1364         // UVM-Lite GPUs always map with RWA
1365         uvm_mutex_lock(&va_block->lock);
1366         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL,
1367                 uvm_va_block_map_mask(va_block,
1368                                       va_block_context,
1369                                       &va_range->uvm_lite_gpus,
1370                                       uvm_va_block_region_from_block(va_block),
1371                                       NULL,
1372                                       UVM_PROT_READ_WRITE_ATOMIC,
1373                                       UvmEventMapRemoteCauseCoherence));
1374         if (status == NV_OK && out_tracker)
1375             status = uvm_tracker_add_tracker(out_tracker, &va_block->tracker);
1376 
1377         uvm_mutex_unlock(&va_block->lock);
1378         if (status != NV_OK)
1379             break;
1380     }
1381 
1382     return status;
1383 }
1384 
// Calculate the mask of GPUs that should follow the UVM-Lite behavior
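//
// A hypothetical example: with the preferred location set to non-faultable
// GPU A and accessed_by = {CPU, GPU A, non-faultable GPU B, faultable GPU C},
// the resulting mask is {GPU A, GPU B}: the non-faultable GPUs with
// accessed-by set plus the non-faultable preferred GPU.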
1386 static void calc_uvm_lite_gpus_mask(uvm_va_space_t *va_space,
1387                                     uvm_processor_id_t preferred_location,
1388                                     const uvm_processor_mask_t *accessed_by_mask,
1389                                     uvm_processor_mask_t *uvm_lite_gpus)
1390 {
1391     uvm_gpu_id_t gpu_id;
1392 
1393     uvm_assert_rwsem_locked_write(&va_space->lock);
1394 
1395     // Zero out the mask first
1396     uvm_processor_mask_zero(uvm_lite_gpus);
1397 
1398     // If no preferred location is set then there are no GPUs following the UVM-Lite behavior
1399     if (UVM_ID_IS_INVALID(preferred_location))
1400         return;
1401 
    // If the preferred location is a faultable GPU, then no GPUs should follow
    // the UVM-Lite behavior.
1404     if (UVM_ID_IS_GPU(preferred_location) &&
1405         uvm_processor_mask_test(&va_space->faultable_processors, preferred_location)) {
1406         return;
1407     }
1408 
    // Otherwise, add to the UVM-Lite mask all non-faultable GPUs that have
    // accessed-by set.
1411     for_each_gpu_id_in_mask(gpu_id, accessed_by_mask) {
1412         if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu_id))
1413             uvm_processor_mask_set(uvm_lite_gpus, gpu_id);
1414     }
1415 
1416     // And the preferred location if it's a GPU
1417     if (UVM_ID_IS_GPU(preferred_location))
1418         uvm_processor_mask_set(uvm_lite_gpus, preferred_location);
1419 }
1420 
// Update the mask of GPUs that follow the UVM-Lite behavior
1422 static void range_update_uvm_lite_gpus_mask(uvm_va_range_t *va_range)
1423 {
1424     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1425     calc_uvm_lite_gpus_mask(va_range->va_space,
1426                             uvm_va_range_get_policy(va_range)->preferred_location,
1427                             &uvm_va_range_get_policy(va_range)->accessed_by,
1428                             &va_range->uvm_lite_gpus);
1429 }
1430 
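// Change the preferred location (and preferred CPU NUMA node) policy of a
// managed range. At a high level this recomputes the UVM-Lite GPU mask, unmaps
// all old and new UVM-Lite GPUs, re-establishes accessed-by mappings where
// needed, commits the new policy, and finally remaps the UVM-Lite GPUs.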
1431 NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range,
1432                                               uvm_processor_id_t preferred_location,
1433                                               int preferred_cpu_nid,
1434                                               struct mm_struct *mm,
1435                                               uvm_tracker_t *out_tracker)
1436 {
1437     NV_STATUS status;
1438     uvm_processor_mask_t all_uvm_lite_gpus;
1439     uvm_processor_mask_t new_uvm_lite_gpus;
1440     uvm_processor_mask_t set_accessed_by_processors;
1441     uvm_range_group_range_iter_t iter;
1442     uvm_range_group_range_t *rgr = NULL;
1443     uvm_va_space_t *va_space = va_range->va_space;
1444     uvm_va_block_t *va_block;
1445     uvm_va_block_context_t *va_block_context;
1446     uvm_va_policy_t *va_range_policy;
1447 
1448     uvm_assert_rwsem_locked_write(&va_space->lock);
1449     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1450 
1451     va_range_policy = uvm_va_range_get_policy(va_range);
1452     if (uvm_va_policy_preferred_location_equal(va_range_policy, preferred_location, preferred_cpu_nid))
1453         return NV_OK;
1454 
    // Mark all range group ranges within this VA range as migrated since the
    // preferred location has changed.
1456     uvm_range_group_for_each_range_in(rgr, va_space, va_range->node.start, va_range->node.end) {
1457         uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
1458         if (list_empty(&rgr->range_group_migrated_list_node))
1459             list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
1460         uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
1461     }
1462 
1463     // Calculate the new UVM-Lite GPUs mask, but don't update va_range state so
1464     // that we can keep block_page_check_mappings() happy while updating the
1465     // mappings.
1466     calc_uvm_lite_gpus_mask(va_space, preferred_location, &va_range_policy->accessed_by, &new_uvm_lite_gpus);
1467 
    // If the range contains non-migratable range groups, check that the new
    // UVM-Lite GPUs can all map the new preferred location.
1470     if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) &&
1471         UVM_ID_IS_VALID(preferred_location) &&
1472         !uvm_processor_mask_subset(&new_uvm_lite_gpus, &va_space->accessible_from[uvm_id_value(preferred_location)])) {
1473         return NV_ERR_INVALID_DEVICE;
1474     }
1475 
1476     if (UVM_ID_IS_INVALID(preferred_location)) {
1477         uvm_range_group_for_each_migratability_in_safe(&iter, va_space, va_range->node.start, va_range->node.end) {
1478             if (!iter.migratable) {
                // Clear the range group association for any unmigratable
                // ranges if there is no preferred location
1480                 status = uvm_range_group_assign_range(va_space, NULL, iter.start, iter.end);
1481                 if (status != NV_OK)
1482                     return status;
1483             }
1484         }
1485     }
1486 
1487     // Unmap all old and new UVM-Lite GPUs
1488     //  - GPUs that stop being UVM-Lite need to be unmapped so that they don't
1489     //    have stale mappings to the old preferred location.
1490     //  - GPUs that will continue to be UVM-Lite GPUs or are new UVM-Lite GPUs
1491     //    need to be unmapped so that the new preferred location can be mapped.
1492     uvm_processor_mask_or(&all_uvm_lite_gpus, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
1493     status = range_unmap_mask(va_range, &all_uvm_lite_gpus, out_tracker);
1494     if (status != NV_OK)
1495         return status;
1496 
    // GPUs that stop being UVM-Lite but are in the accessed_by mask need to
    // have any possible mappings established.
1499     uvm_processor_mask_andnot(&set_accessed_by_processors, &va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
1500 
1501     // A GPU which had been in UVM-Lite mode before must still be in UVM-Lite
1502     // mode if it is the new preferred location. Otherwise we'd have to be more
1503     // careful below to not establish remote mappings to the new preferred
1504     // location.
1505     if (UVM_ID_IS_GPU(preferred_location))
1506         UVM_ASSERT(!uvm_processor_mask_test(&set_accessed_by_processors, preferred_location));
1507 
1508     // The old preferred location should establish new remote mappings if it has
1509     // accessed-by set.
1510     if (UVM_ID_IS_VALID(va_range_policy->preferred_location))
1511         uvm_processor_mask_set(&set_accessed_by_processors, va_range_policy->preferred_location);
1512 
1513     uvm_processor_mask_and(&set_accessed_by_processors, &set_accessed_by_processors, &va_range_policy->accessed_by);
1514 
1515     // Now update the va_range state
1516     va_range_policy->preferred_location = preferred_location;
1517     va_range_policy->preferred_nid = preferred_cpu_nid;
1518     uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
1519 
1520     va_block_context = uvm_va_space_block_context(va_space, mm);
1521 
1522     for_each_va_block_in_va_range(va_range, va_block) {
1523         uvm_processor_id_t id;
1524         uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
1525 
1526         for_each_id_in_mask(id, &set_accessed_by_processors) {
1527             status = uvm_va_block_set_accessed_by(va_block, va_block_context, id);
1528             if (status != NV_OK)
1529                 return status;
1530         }
1531 
1532         // Also, mark CPU pages as dirty and remove remote mappings from the new
1533         // preferred location
1534         uvm_mutex_lock(&va_block->lock);
1535         status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
1536                                            NULL,
1537                                            uvm_va_block_set_preferred_location_locked(va_block,
1538                                                                                       va_block_context,
1539                                                                                       region));
1540 
1541         if (out_tracker) {
1542             NV_STATUS tracker_status;
1543 
1544             tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
1545             if (status == NV_OK)
1546                 status = tracker_status;
1547         }
1548 
1549         uvm_mutex_unlock(&va_block->lock);
1550 
1551         if (status != NV_OK)
1552             return status;
1553     }
1554 
1555     // And lastly map all of the current UVM-Lite GPUs to the resident pages on
1556     // the new preferred location. Anything that's not resident right now will
1557     // get mapped on the next PreventMigration().
1558     return range_map_uvm_lite_gpus(va_range, out_tracker);
1559 }
1560 
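// Add processor_id to the range's accessed_by policy. A GPU that becomes
// UVM-Lite as a result is unmapped first so that it can map the preferred
// location; accessed-by mappings are then established on every block.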
1561 NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range,
1562                                        uvm_processor_id_t processor_id,
1563                                        struct mm_struct *mm,
1564                                        uvm_tracker_t *out_tracker)
1565 {
1566     NV_STATUS status;
1567     uvm_va_block_t *va_block;
1568     uvm_processor_mask_t new_uvm_lite_gpus;
1569     uvm_va_space_t *va_space = va_range->va_space;
1570     uvm_va_policy_t *policy = uvm_va_range_get_policy(va_range);
1571     uvm_va_block_context_t *va_block_context;
1572 
    // If the range belongs to a non-migratable range group and processor_id is
    // a non-faultable GPU, check that it can map the preferred location.
1575     if (!uvm_range_group_all_migratable(va_space, va_range->node.start, va_range->node.end) &&
1576         UVM_ID_IS_GPU(processor_id) &&
1577         !uvm_processor_mask_test(&va_space->faultable_processors, processor_id) &&
1578         !uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(policy->preferred_location)], processor_id))
1579         return NV_ERR_INVALID_DEVICE;
1580 
1581     uvm_processor_mask_set(&policy->accessed_by, processor_id);
1582 
1583     // If a GPU is already a UVM-Lite GPU then there is nothing else to do.
1584     if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id))
1585         return NV_OK;
1586 
1587     // Calculate the new UVM-Lite GPUs mask, but don't update it in the va range
1588     // yet so that we can keep block_page_check_mappings() happy while updating
1589     // the mappings.
1590     calc_uvm_lite_gpus_mask(va_space, policy->preferred_location, &policy->accessed_by, &new_uvm_lite_gpus);
1591 
1592     if (uvm_processor_mask_test(&new_uvm_lite_gpus, processor_id)) {
1593         // GPUs that become UVM-Lite GPUs need to unmap everything so that they
1594         // can map the preferred location.
1595         status = range_unmap(va_range, processor_id, out_tracker);
1596         if (status != NV_OK)
1597             return status;
1598     }
1599 
1600     uvm_processor_mask_copy(&va_range->uvm_lite_gpus, &new_uvm_lite_gpus);
1601     va_block_context = uvm_va_space_block_context(va_space, mm);
1602 
1603     for_each_va_block_in_va_range(va_range, va_block) {
1604         status = uvm_va_block_set_accessed_by(va_block, va_block_context, processor_id);
1605         if (status != NV_OK)
1606             return status;
1607     }
1608 
1609     return NV_OK;
1610 }
1611 
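// Remove processor_id from the range's accessed_by policy and update the
// UVM-Lite GPU mask, unmapping the processor if it stops being a UVM-Lite GPU.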
1612 void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
1613                                     uvm_processor_id_t processor_id,
1614                                     uvm_tracker_t *out_tracker)
1615 {
1616     uvm_range_group_range_t *rgr = NULL;
1617 
    // Mark all range group ranges within this VA range as migrated. We do this
    // to force uvm_range_group_set_migration_policy to re-check the policy
    // state since we're changing it here.
1620     uvm_range_group_for_each_range_in(rgr, va_range->va_space, va_range->node.start, va_range->node.end) {
1621         uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
1622         if (list_empty(&rgr->range_group_migrated_list_node))
1623             list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
1624         uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
1625     }
1626 
1627     uvm_processor_mask_clear(&uvm_va_range_get_policy(va_range)->accessed_by, processor_id);
1628 
1629     // If a UVM-Lite GPU is being removed from the accessed_by mask, it will
1630     // also stop being a UVM-Lite GPU unless it's also the preferred location.
1631     if (uvm_processor_mask_test(&va_range->uvm_lite_gpus, processor_id) &&
1632         !uvm_id_equal(uvm_va_range_get_policy(va_range)->preferred_location, processor_id)) {
1633         range_unmap(va_range, processor_id, out_tracker);
1634     }
1635 
1636     range_update_uvm_lite_gpus_mask(va_range);
1637 }
1638 
1639 NV_STATUS uvm_va_range_set_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm)
1640 {
1641     uvm_va_block_t *va_block;
1642     uvm_va_block_context_t *va_block_context;
1643 
1644     if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_ENABLED)
1645         return NV_OK;
1646 
1647     va_block_context = uvm_va_space_block_context(va_range->va_space, mm);
1648 
1649     for_each_va_block_in_va_range(va_range, va_block) {
1650         NV_STATUS status = uvm_va_block_set_read_duplication(va_block, va_block_context);
1651 
1652         if (status != NV_OK)
1653             return status;
1654     }
1655 
1656     return NV_OK;
1657 }
1658 
1659 NV_STATUS uvm_va_range_unset_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm)
1660 {
1661     uvm_va_block_t *va_block;
1662     uvm_va_block_context_t *va_block_context;
1663     NV_STATUS status;
1664 
1665     if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
1666         return NV_OK;
1667 
1668     va_block_context = uvm_va_space_block_context(va_range->va_space, mm);
1669 
1670     for_each_va_block_in_va_range(va_range, va_block) {
1671         status = uvm_va_block_unset_read_duplication(va_block, va_block_context);
1672 
1673         if (status != NV_OK)
1674             return status;
1675     }
1676 
1677     return NV_OK;
1678 }
1679 
1680 uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(struct vm_area_struct *vma)
1681 {
1682     uvm_vma_wrapper_t *vma_wrapper = nv_kmem_cache_zalloc(g_uvm_vma_wrapper_cache, NV_UVM_GFP_FLAGS);
1683     if (!vma_wrapper)
1684         return NULL;
1685 
1686     vma_wrapper->vma = vma;
1687     uvm_init_rwsem(&vma_wrapper->lock, UVM_LOCK_ORDER_LEAF);
1688 
1689     return vma_wrapper;
1690 }
1691 
1692 void uvm_vma_wrapper_destroy(uvm_vma_wrapper_t *vma_wrapper)
1693 {
1694     if (!vma_wrapper)
1695         return;
1696 
1697     uvm_assert_rwsem_unlocked(&vma_wrapper->lock);
1698 
1699     kmem_cache_free(g_uvm_vma_wrapper_cache, vma_wrapper);
1700 }
1701 
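// PTE maker callback for uvm_page_table_range_vec_write_ptes(): every PTE in a
// SKED reflected range uses the same HAL-provided value, so the offset and
// caller_data arguments are ignored.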
1702 static NvU64 sked_reflected_pte_maker(uvm_page_table_range_vec_t *range_vec, NvU64 offset, void *caller_data)
1703 {
1704     (void)caller_data;
1705 
1706     return range_vec->tree->hal->make_sked_reflected_pte();
1707 }
1708 
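// Create a SKED reflected VA range covering exactly [base, base + length) and
// write its PTEs in the GPU's page tree. length must be a GPU page size
// supported by the page tables and base must be aligned to it.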
1709 static NV_STATUS uvm_map_sked_reflected_range(uvm_va_space_t *va_space, UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS *params)
1710 {
1711     NV_STATUS status;
1712     uvm_va_range_t *va_range = NULL;
1713     uvm_gpu_t *gpu;
1714     uvm_gpu_va_space_t *gpu_va_space;
1715     uvm_page_tree_t *page_tables;
1716     struct mm_struct *mm;
1717 
1718     if (uvm_api_range_invalid_4k(params->base, params->length))
1719         return NV_ERR_INVALID_ADDRESS;
1720 
1721     // The mm needs to be locked in order to remove stale HMM va_blocks.
1722     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1723     uvm_va_space_down_write(va_space);
1724 
1725     gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->gpuUuid);
1726     if (!gpu) {
1727         status = NV_ERR_INVALID_DEVICE;
1728         goto done;
1729     }
1730 
1731     // Check if the GPU can access the VA
1732     if (!uvm_gpu_can_address(gpu, params->base, params->length)) {
1733         status = NV_ERR_OUT_OF_RANGE;
1734         goto done;
1735     }
1736 
1737     gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
1738     page_tables = &gpu_va_space->page_tables;
1739 
1740     // The VA range must exactly cover one supported GPU page
1741     if (!is_power_of_2(params->length) ||
1742         !IS_ALIGNED(params->base, params->length) ||
1743         !uvm_mmu_page_size_supported(page_tables, params->length)) {
1744         status = NV_ERR_INVALID_ADDRESS;
1745         goto done;
1746     }
1747 
1748     status = uvm_va_range_create_sked_reflected(va_space, mm, params->base, params->length, &va_range);
1749     if (status != NV_OK) {
1750         UVM_DBG_PRINT_RL("Failed to create sked reflected VA range [0x%llx, 0x%llx)\n",
1751                 params->base, params->base + params->length);
1752         goto done;
1753     }
1754 
1755     va_range->sked_reflected.gpu_va_space = gpu_va_space;
1756 
1757     status = uvm_page_table_range_vec_init(page_tables,
1758                                            va_range->node.start,
1759                                            uvm_va_range_size(va_range),
1760                                            params->length,
1761                                            UVM_PMM_ALLOC_FLAGS_EVICT,
1762                                            &va_range->sked_reflected.pt_range_vec);
1763     if (status != NV_OK)
1764         goto done;
1765 
1766     status = uvm_page_table_range_vec_write_ptes(&va_range->sked_reflected.pt_range_vec,
1767             UVM_MEMBAR_NONE, sked_reflected_pte_maker, NULL);
1768 
1769     if (status != NV_OK)
1770         goto done;
1771 
1772 done:
1773     if (status != NV_OK && va_range != NULL)
1774         uvm_va_range_destroy(va_range, NULL);
1775 
1776     uvm_va_space_up_write(va_space);
1777     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
1778 
1779     return status;
1780 }
1781 
1782 NV_STATUS uvm_api_map_dynamic_parallelism_region(UVM_MAP_DYNAMIC_PARALLELISM_REGION_PARAMS *params, struct file *filp)
1783 {
1784     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1785 
    // Notably, the ranges created by the UvmMapDynamicParallelismRegion() API
    // are referred to internally as "SKED reflected ranges", as that name is
    // more descriptive.
1789     return uvm_map_sked_reflected_range(va_space, params);
1790 }
1791 
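// UVM_ALLOC_SEMAPHORE_POOL ioctl entry point: create a semaphore pool VA range
// and register it with every GPU in the VA space, mapping it into GPU VA
// spaces that are already registered.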
1792 NV_STATUS uvm_api_alloc_semaphore_pool(UVM_ALLOC_SEMAPHORE_POOL_PARAMS *params, struct file *filp)
1793 {
1794     NV_STATUS status;
1795     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1796     uvm_va_range_t *va_range = NULL;
1797     uvm_gpu_t *gpu;
1798     struct mm_struct *mm;
1799 
1800     if (uvm_api_range_invalid(params->base, params->length))
1801         return NV_ERR_INVALID_ADDRESS;
1802     if (params->gpuAttributesCount > UVM_MAX_GPUS)
1803         return NV_ERR_INVALID_ARGUMENT;
1804 
1805     if (g_uvm_global.conf_computing_enabled && params->gpuAttributesCount == 0)
1806         return NV_ERR_INVALID_ARGUMENT;
1807 
1808     // The mm needs to be locked in order to remove stale HMM va_blocks.
1809     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1810     uvm_va_space_down_write(va_space);
1811 
1812     status = uvm_va_range_create_semaphore_pool(va_space,
1813                                                 mm,
1814                                                 params->base,
1815                                                 params->length,
1816                                                 params->perGpuAttributes,
1817                                                 params->gpuAttributesCount,
1818                                                 &va_range);
1819     if (status != NV_OK)
1820         goto unlock;
1821 
1822     for_each_va_space_gpu(gpu, va_space) {
1823         status = va_range_register_gpu_semaphore_pool(va_range, gpu);
1824         if (status != NV_OK)
1825             goto done;
1826 
1827         if (!uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
1828             continue;
1829 
1830         status = va_range_add_gpu_va_space_semaphore_pool(va_range, gpu);
1831         if (status != NV_OK)
1832             goto done;
1833     }
1834 
1835 done:
1836     if (status != NV_OK)
1837         uvm_va_range_destroy(va_range, NULL);
1838 
1839 unlock:
1840     uvm_va_space_up_write(va_space);
1841     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
1842     return status;
1843 }
1844 
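// UVM_TEST_VA_RANGE_INFO ioctl: report the bounds, policy and (for managed
// ranges) vma information of the range containing lookup_address. When no
// va_range is found, HMM ranges are handled by uvm_hmm_va_range_info().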
1845 NV_STATUS uvm_test_va_range_info(UVM_TEST_VA_RANGE_INFO_PARAMS *params, struct file *filp)
1846 {
1847     uvm_va_space_t *va_space;
1848     uvm_va_range_t *va_range;
1849     uvm_processor_id_t processor_id;
1850     uvm_va_policy_t *policy;
1851     struct vm_area_struct *vma;
1852     NV_STATUS status = NV_OK;
1853     struct mm_struct *mm;
1854 
1855     va_space = uvm_va_space_get(filp);
1856 
1857     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1858     uvm_va_space_down_read(va_space);
1859 
1860     va_range = uvm_va_range_find(va_space, params->lookup_address);
1861     if (!va_range) {
1862         status = uvm_hmm_va_range_info(va_space, mm, params);
1863         goto out;
1864     }
1865 
1866     policy = uvm_va_range_get_policy(va_range);
1867     params->va_range_start = va_range->node.start;
1868     params->va_range_end   = va_range->node.end;
1869 
1870     // -Wall implies -Wenum-compare, so cast through int to avoid warnings
1871     BUILD_BUG_ON((int)UVM_READ_DUPLICATION_UNSET    != (int)UVM_TEST_READ_DUPLICATION_UNSET);
1872     BUILD_BUG_ON((int)UVM_READ_DUPLICATION_ENABLED  != (int)UVM_TEST_READ_DUPLICATION_ENABLED);
1873     BUILD_BUG_ON((int)UVM_READ_DUPLICATION_DISABLED != (int)UVM_TEST_READ_DUPLICATION_DISABLED);
1874     BUILD_BUG_ON((int)UVM_READ_DUPLICATION_MAX      != (int)UVM_TEST_READ_DUPLICATION_MAX);
1875     params->read_duplication = policy->read_duplication;
1876 
1877     if (UVM_ID_IS_INVALID(policy->preferred_location)) {
1878         memset(&params->preferred_location, 0, sizeof(params->preferred_location));
1879         params->preferred_cpu_nid = NUMA_NO_NODE;
1880     }
1881     else {
1882         uvm_va_space_processor_uuid(va_space, &params->preferred_location, policy->preferred_location);
1883         params->preferred_cpu_nid = policy->preferred_nid;
1884     }
1885 
1886     params->accessed_by_count = 0;
1887     for_each_id_in_mask(processor_id, &policy->accessed_by)
1888         uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
1889 
1890     // -Wall implies -Wenum-compare, so cast through int to avoid warnings
1891     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_INVALID        != (int)UVM_VA_RANGE_TYPE_INVALID);
1892     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_MANAGED        != (int)UVM_VA_RANGE_TYPE_MANAGED);
1893     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_EXTERNAL       != (int)UVM_VA_RANGE_TYPE_EXTERNAL);
1894     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_CHANNEL        != (int)UVM_VA_RANGE_TYPE_CHANNEL);
1895     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_SKED_REFLECTED != (int)UVM_VA_RANGE_TYPE_SKED_REFLECTED);
1896     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_SEMAPHORE_POOL != (int)UVM_VA_RANGE_TYPE_SEMAPHORE_POOL);
1897     BUILD_BUG_ON((int)UVM_TEST_VA_RANGE_TYPE_MAX            != (int)UVM_VA_RANGE_TYPE_MAX);
1898     params->type = va_range->type;
1899 
1900     switch (va_range->type) {
1901         case UVM_VA_RANGE_TYPE_MANAGED:
1903             params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_UVM;
1904             if (!va_range->managed.vma_wrapper) {
1905                 params->managed.is_zombie = NV_TRUE;
1906                 goto out;
1907             }
1908             params->managed.is_zombie = NV_FALSE;
1909             vma = uvm_va_range_vma_check(va_range, mm);
1910             if (!vma) {
1911                 // We aren't in the same mm as the one which owns the vma, and
1912                 // we don't have that mm locked.
1913                 params->managed.owned_by_calling_process = NV_FALSE;
1914                 goto out;
1915             }
1916             params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);
1917             params->managed.vma_start = vma->vm_start;
1918             params->managed.vma_end   = vma->vm_end - 1;
1919             break;
1920         default:
1921             break;
1922     }
1923 
1924 out:
1925     uvm_va_space_up_read(va_space);
1926     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
1927     return status;
1928 }
1929 
1930 NV_STATUS uvm_test_va_range_split(UVM_TEST_VA_RANGE_SPLIT_PARAMS *params, struct file *filp)
1931 {
1932     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1933     uvm_va_range_t *va_range;
1934     NV_STATUS status = NV_OK;
1935 
1936     if (!PAGE_ALIGNED(params->split_address + 1))
1937         return NV_ERR_INVALID_ADDRESS;
1938 
1939     uvm_va_space_down_write(va_space);
1940 
1941     va_range = uvm_va_range_find(va_space, params->split_address);
1942     if (!va_range ||
1943         va_range->node.end == params->split_address ||
1944         va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
1945         status = NV_ERR_INVALID_ADDRESS;
1946         goto out;
1947     }
1948 
1949     status = uvm_va_range_split(va_range, params->split_address, NULL);
1950 
1951 out:
1952     uvm_va_space_up_write(va_space);
1953     return status;
1954 }
1955 
1956 NV_STATUS uvm_test_va_range_inject_split_error(UVM_TEST_VA_RANGE_INJECT_SPLIT_ERROR_PARAMS *params, struct file *filp)
1957 {
1958     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1959     uvm_va_range_t *va_range;
1960     struct mm_struct *mm;
1961     NV_STATUS status = NV_OK;
1962 
1963     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1964     uvm_va_space_down_write(va_space);
1965 
1966     va_range = uvm_va_range_find(va_space, params->lookup_address);
1967     if (!va_range) {
1968         if (!mm)
1969             status = NV_ERR_INVALID_ADDRESS;
1970         else
1971             status = uvm_hmm_test_va_block_inject_split_error(va_space, params->lookup_address);
1972     }
1973     else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
1974         status = NV_ERR_INVALID_ADDRESS;
1975     }
1976     else {
1977         uvm_va_block_t *va_block;
1978         size_t split_index;
1979 
1980         va_range->inject_split_error = true;
1981 
1982         split_index = uvm_va_range_block_index(va_range, params->lookup_address);
1983         va_block = uvm_va_range_block(va_range, split_index);
1984         if (va_block) {
1985             uvm_va_block_test_t *block_test = uvm_va_block_get_test(va_block);
1986 
1987             if (block_test)
1988                 block_test->inject_split_error = true;
1989         }
1990     }
1991 
1992     uvm_va_space_up_write(va_space);
1993     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
1994     return status;
1995 }
1996 
1997 NV_STATUS uvm_test_va_range_inject_add_gpu_va_space_error(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR_PARAMS *params,
1998                                                           struct file *filp)
1999 {
2000     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2001     uvm_va_range_t *va_range;
2002     NV_STATUS status = NV_OK;
2003 
2004     uvm_va_space_down_write(va_space);
2005 
2006     va_range = uvm_va_range_find(va_space, params->lookup_address);
2007     if (!va_range) {
2008         status = NV_ERR_INVALID_ADDRESS;
2009         goto out;
2010     }
2011 
2012     va_range->inject_add_gpu_va_space_error = true;
2013 
2014 out:
2015     uvm_va_space_up_write(va_space);
2016     return status;
2017 }
2018 
2019