1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_linux.h"
25 #include "uvm_common.h"
26 #include "uvm_api.h"
27 #include "uvm_gpu.h"
28 #include "uvm_va_space.h"
29 #include "uvm_va_range.h"
30 #include "uvm_va_block.h"
31 #include "uvm_hal_types.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_tools.h"
34 #include "uvm_push.h"
35 #include "uvm_hal.h"
36 #include "uvm_perf_thrashing.h"
37 #include "uvm_perf_prefetch.h"
38 #include "uvm_mem.h"
39 #include "uvm_gpu_access_counters.h"
40 #include "uvm_va_space_mm.h"
41 #include "uvm_test_ioctl.h"
42 #include "uvm_conf_computing.h"
43 
44 typedef enum
45 {
46     BLOCK_PTE_OP_MAP,
47     BLOCK_PTE_OP_REVOKE,
48     BLOCK_PTE_OP_COUNT
49 } block_pte_op_t;
50 
51 static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;
52 
53 static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
54 static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
55 static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
56 static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
57 
58 static int uvm_fault_force_sysmem __read_mostly = 0;
59 module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
60 MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");
61 
62 static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
63 module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);
64 
65 // Caching is always disabled for mappings to remote memory. The following two
66 // module parameters can be used to force caching for GPU peer/sysmem mappings.
67 //
// However, note that enabling caching may not be safe in the general case, so
// these parameters should only be used for experiments.
70 static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
71 module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
72 MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
73                  "Force caching for mappings to peer memory. "
74                  "This is an experimental parameter that may cause correctness issues if used.");
75 
76 static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
77 module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
78 MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
79                  "Force caching for mappings to system memory. "
80                  "This is an experimental parameter that may cause correctness issues if used.");
81 
82 static void block_add_eviction_mappings_entry(void *args);
83 
84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block)
85 {
86 #if UVM_IS_CONFIG_HMM()
87     if (va_block->hmm.va_space)
88         return va_block->hmm.va_space;
89 #endif
90 
91     if (va_block->va_range)
92         return va_block->va_range->va_space;
93 
94     return NULL;
95 }
96 
97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
98 {
99     uvm_va_space_t *va_space;
100 
101     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
102 
103     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
104     UVM_ASSERT(va_space);
105 
106     return va_space;
107 }
108 
109 bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
110                                         const uvm_va_policy_t *policy,
111                                         uvm_va_block_region_t region)
112 {
113     uvm_assert_mutex_locked(&va_block->lock);
114 
115     if (uvm_va_block_is_hmm(va_block)) {
116         const uvm_va_policy_node_t *node;
117 
118         if (uvm_va_policy_is_default(policy)) {
119             // There should only be the default policy within the region.
120             node = uvm_va_policy_node_iter_first(va_block,
121                                                  uvm_va_block_region_start(va_block, region),
122                                                  uvm_va_block_region_end(va_block, region));
123             UVM_ASSERT(!node);
124         }
125         else {
126             // The policy node should cover the region.
127             node = uvm_va_policy_node_from_policy(policy);
128             UVM_ASSERT(node->node.start <= uvm_va_block_region_start(va_block, region));
129             UVM_ASSERT(node->node.end >= uvm_va_block_region_end(va_block, region));
130         }
131     }
132     else {
133         UVM_ASSERT(policy == uvm_va_range_get_policy(va_block->va_range));
134     }
135 
136     return true;
137 }
138 
139 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
140 {
141     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
142 
143     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
144 
145     // Local vidmem is always cached
146     if (uvm_id_equal(resident_id, gpu->id))
147         return UVM_MMU_PTE_FLAGS_CACHED;
148 
149     if (UVM_ID_IS_CPU(resident_id))
150         return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
151 
152     UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));
153 
154     return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
155 }
156 
157 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
158 {
159     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
160 
161     return uvm_va_space_get_gpu(va_space, gpu_id);
162 }
163 
164 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id)
165 {
166     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
167 
168     return uvm_va_space_processor_name(va_space, id);
169 }
170 
171 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id)
172 {
173     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
174 
175     return uvm_va_space_processor_has_memory(va_space, id);
176 }
177 
178 static bool is_uvm_fault_force_sysmem_set(void)
179 {
180     // Only enforce this during testing
181     return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0;
182 }
183 
184 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space)
185 {
186     return uvm_perf_map_remote_on_eviction &&
187            uvm_va_space_has_access_counter_migrations(va_space);
188 }
189 
190 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block)
191 {
192     // Note that for HMM we always return a pointer to a zero bitmap
193     // (not allocated on the stack) since uvm_lite GPUs are not supported.
194     static const uvm_processor_mask_t uvm_lite_gpus = {};
195 
196     if (uvm_va_block_is_hmm(va_block))
197         return &uvm_lite_gpus;
198     else
199         return &va_block->va_range->uvm_lite_gpus;
200 }
201 
202 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
203 {
204     if (!retry)
205         return;
206 
207     uvm_tracker_init(&retry->tracker);
208     INIT_LIST_HEAD(&retry->used_chunks);
209     INIT_LIST_HEAD(&retry->free_chunks);
210 }
211 
212 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks
213 // are stored.
214 //
215 // CPU chunk storage is handled in three different ways depending on the
216 // type of chunks the VA block owns. This is done to minimize the memory
217 // required to hold metadata.
218 typedef enum
219 {
    // The uvm_va_block_t::chunks pointer points to a single 2MB
    // CPU chunk.
222     UVM_CPU_CHUNK_STORAGE_CHUNK = 0,
223 
224     // The uvm_va_block_t::chunks pointer points to a
225     // structure of mixed (64K and 4K) chunks.
226     UVM_CPU_CHUNK_STORAGE_MIXED,
227     UVM_CPU_CHUNK_STORAGE_COUNT,
228 } uvm_cpu_chunk_storage_type_t;
229 
230 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1
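
// For illustration: a chunks value of ((unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED)
// stores a pointer to a uvm_cpu_chunk_storage_mixed_t, while
// ((unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK) stores a single 2MB
// uvm_cpu_chunk_t pointer. uvm_cpu_storage_get_ptr() strips the tag bit and
// uvm_cpu_storage_get_type() extracts it.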
231 
232 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is
233 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will
234 // be stored in the first/last entry, respectively.
235 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK
236 
237 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE)
238 
// This structure is used when a VA block contains 4K and/or 64K CPU chunks,
// i.e., anything other than a single 2MB chunk.
//
// For every 64K CPU chunk, big_chunks will have its corresponding bit set
// and the corresponding index in slots will point directly to the
// uvm_cpu_chunk_t structure.
244 //
245 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and
246 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers.
247 typedef struct {
248     DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
249     void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK];
250 } uvm_cpu_chunk_storage_mixed_t;
251 
252 static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
253                                                         uvm_cpu_chunk_t *chunk,
254                                                         uvm_page_index_t page_index)
255 {
256     UVM_ASSERT(chunk);
257     return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
258 }
259 
260 static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
261 {
262     return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
263 }
264 
265 static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
266 {
267     return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
268 }
269 
270 static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
271 {
272     return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
273 }
274 
275 static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
276 {
277     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
278     uvm_page_index_t prefix;
279     size_t slot_index;
280 
281     UVM_ASSERT(page_index < block_region.outer);
282     prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
283 
284     if (page_index < prefix)
285         return 0;
286 
287     slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
288     UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
289 
290     return slot_index;
291 }
292 
293 static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
294 {
295     size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
296 
297     if (page_index < prefix)
298         return page_index;
299 
300     return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
301 }
302 
303 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
304                                         uvm_cpu_chunk_t *chunk,
305                                         uvm_page_index_t page_index)
306 {
307     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
308     uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
309     size_t slot_index;
310     uvm_cpu_chunk_storage_mixed_t *mixed;
311     uvm_cpu_chunk_t **chunks = NULL;
312 
313     // We only want to use the bottom bit of a pointer.
314     BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);
315 
316     // We want to protect against two threads manipulating the VA block's CPU
317     // chunks at the same time. However, when a block is split, the new block's
318     // lock is locked without tracking. So, we can't use
319     // uvm_assert_mutex_locked().
320     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
321 
322     if (chunk_size == UVM_CHUNK_SIZE_2M) {
323         UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
324         UVM_ASSERT(!va_block->cpu.chunks);
325         va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
326     }
327     else {
328         if (!va_block->cpu.chunks) {
329             mixed = uvm_kvmalloc_zero(sizeof(*mixed));
330             if (!mixed)
331                 return NV_ERR_NO_MEMORY;
332 
333             va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
334         }
335 
336         UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
337         mixed = uvm_cpu_storage_get_ptr(va_block);
338         slot_index = compute_slot_index(va_block, page_index);
339         UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
340         UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
341 
342         if (chunk_size == UVM_CHUNK_SIZE_64K) {
343             mixed->slots[slot_index] = chunk;
344             set_bit(slot_index, mixed->big_chunks);
345         }
346         else {
347             size_t small_index;
348 
349             UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
350             chunks = mixed->slots[slot_index];
351 
352             if (!chunks) {
353                 chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
354                 if (!chunks)
355                     return NV_ERR_NO_MEMORY;
356                 mixed->slots[slot_index] = chunks;
357             }
358 
359             small_index = compute_small_index(va_block, page_index);
360             chunks[small_index] = chunk;
361         }
362     }
363 
364     uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
365     return NV_OK;
366 }
367 
368 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
369 {
370     uvm_cpu_chunk_storage_mixed_t *mixed;
371     uvm_cpu_chunk_t *chunk;
372     uvm_cpu_chunk_t **chunks;
373     size_t slot_index;
374 
375     UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
376     if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
377         return NULL;
378 
379     UVM_ASSERT(va_block->cpu.chunks);
380 
381     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
382         return uvm_cpu_storage_get_ptr(va_block);
383     }
384     else {
385         mixed = uvm_cpu_storage_get_ptr(va_block);
386         slot_index = compute_slot_index(va_block, page_index);
387         UVM_ASSERT(mixed->slots[slot_index] != NULL);
388         if (test_bit(slot_index, mixed->big_chunks))
389             return mixed->slots[slot_index];
390 
391         chunks = mixed->slots[slot_index];
392         chunk = chunks[compute_small_index(va_block, page_index)];
393     }
394 
395     UVM_ASSERT(chunk);
396     return chunk;
397 }
398 
399 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
400                                      uvm_page_index_t page_index)
401 {
402     uvm_cpu_chunk_storage_mixed_t *mixed;
403     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
404     uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
405     size_t slot_index;
406     uvm_cpu_chunk_t **chunks;
407 
408     // We want to protect against two threads manipulating the VA block's CPU
409     // chunks at the same time. However, when a block is split, the new block's
410     // lock is locked without tracking. So, we can't use
411     // uvm_assert_mutex_locked().
412     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
413     UVM_ASSERT(va_block->cpu.chunks);
414     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));
415 
416     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
417         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
418         UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
419         va_block->cpu.chunks = 0;
420     }
421     else {
422         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
423         mixed = uvm_cpu_storage_get_ptr(va_block);
424         slot_index = compute_slot_index(va_block, page_index);
425         UVM_ASSERT(mixed->slots[slot_index] != NULL);
426 
427         if (test_bit(slot_index, mixed->big_chunks)) {
428             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
429             UVM_ASSERT(mixed->slots[slot_index] == chunk);
430             mixed->slots[slot_index] = NULL;
431             clear_bit(slot_index, mixed->big_chunks);
432         }
433         else {
434             size_t small_index;
435 
436             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
437             chunks = mixed->slots[slot_index];
438             small_index = compute_small_index(va_block, page_index);
439             UVM_ASSERT(chunks[small_index] == chunk);
440             chunks[small_index] = NULL;
441 
442             for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
443                 if (chunks[small_index])
444                     break;
445             }
446 
447             if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
448                 uvm_kvfree(chunks);
449                 mixed->slots[slot_index] = NULL;
450             }
451         }
452     }
453 
454     uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);
455 
456     if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
457         uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
458         va_block->cpu.chunks = 0;
459     }
460 }
461 
462 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
463 {
464     uvm_va_block_region_t chunk_region;
465     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
466 
467     UVM_ASSERT(chunk);
468     UVM_ASSERT(chunk->page);
469     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
470     return chunk->page + (page_index - chunk_region.first);
471 }
472 
473 static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
474                                                       uvm_va_block_region_t region,
475                                                       uvm_page_index_t *first_chunk_page)
476 {
477     uvm_cpu_chunk_t *chunk = NULL;
478     uvm_page_index_t page_index;
479 
480     page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
481     if (page_index < region.outer)
482         chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
483 
484     if (first_chunk_page && chunk) {
485         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
486         *first_chunk_page = chunk_region.first;
487     }
488 
489     return chunk;
490 }
491 
492 #define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region)                                       \
493     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index));                                \
494          (chunk) != NULL;                                                                                             \
495          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
496                                                  uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
497                                                                      (region).outer),                                 \
498                                                  &(page_index)))
499 
500 #define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region)    \
501     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)),                   \
502                        (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);  \
503          (chunk) != NULL;                                                                                \
504          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                             \
505                                                  uvm_va_block_region((next_page_index), (region).outer), \
506                                                  &(page_index)),                                         \
507              (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
508 
509 #define for_each_cpu_chunk_in_block(chunk, page_index, va_block)        \
510     for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))
511 
512 #define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)  \
513     for_each_cpu_chunk_in_block_region_safe((chunk),                                    \
514                                             (page_index),                               \
515                                             (next_page_index),                          \
516                                             (va_block),                                 \
517                                             uvm_va_block_region_from_block((va_block)))
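
// Illustrative usage sketch of the iterators above (see
// block_check_cpu_chunks() below for a real example); do_something() is a
// hypothetical callback:
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block)
//         do_something(chunk, page_index);
//
// The _safe variants must be used when the loop body may remove the current
// chunk from the block, as uvm_va_block_remove_cpu_chunks() does.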
518 
519 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
520                                                     struct mm_struct *mm,
521                                                     NvU64 start,
522                                                     uvm_va_block_region_t *region)
523 {
524     struct vm_area_struct *vma;
525     NvU64 end;
526 
527     if (start > va_block->end)
528         return NULL;
529 
530     vma = find_vma_intersection(mm, start, va_block->end + 1);
531     if (!vma)
532         return NULL;
533 
534     if (start < vma->vm_start)
535         start = vma->vm_start;
536 
537     end = vma->vm_end - 1;
538     if (end > va_block->end)
539         end = va_block->end;
540 
541     *region = uvm_va_block_region_from_start_end(va_block, start, end);
542 
543     return vma;
544 }
545 
546 static bool block_check_cpu_chunks(uvm_va_block_t *block)
547 {
548     uvm_cpu_chunk_t *chunk;
549     size_t alloced_pages = 0;
550     uvm_va_block_region_t prev_region = { 0 };
551     uvm_page_index_t page_index;
552 
553     for_each_cpu_chunk_in_block(chunk, page_index, block) {
554         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
555         size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
556         uvm_page_index_t chunk_page;
557 
558         UVM_ASSERT(prev_region.outer <= chunk_region.first);
559         UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
560         UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
561 
562         alloced_pages += uvm_cpu_chunk_num_pages(chunk);
563         UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
564         prev_region = chunk_region;
565 
566         for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
567             UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
568     }
569 
570     UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));
571 
572     return true;
573 }
574 
575 // Frees any left-over free chunks and unpins all the used chunks
576 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
577 {
578     uvm_gpu_t *gpu;
579     uvm_gpu_chunk_t *gpu_chunk;
580     uvm_gpu_chunk_t *next_chunk;
581 
582     if (!retry)
583         return;
584 
585     uvm_tracker_deinit(&retry->tracker);
586 
587     // Free any unused chunks
588     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
589         list_del_init(&gpu_chunk->list);
590         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
591         uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
592     }
593 
594     // Unpin all the used chunks now that we are done
595     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
596         list_del_init(&gpu_chunk->list);
597         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        // HMM should have already moved allocated chunks to the referenced
        // state, so any left over were not migrated and should be freed.
600         if (uvm_va_block_is_hmm(va_block))
601             uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
602         else
603             uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
604     }
605 }
606 
607 static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
608 {
609     list_add_tail(&gpu_chunk->list, &retry->free_chunks);
610 }
611 
612 static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
613 {
614     list_add_tail(&gpu_chunk->list, &retry->used_chunks);
615 }
616 
617 static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
618 {
619     uvm_gpu_chunk_t *gpu_chunk;
620 
621     list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
622         if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
623             list_del_init(&gpu_chunk->list);
624             return gpu_chunk;
625         }
626     }
627 
628     return NULL;
629 }
630 
631 // Encapsulates a reference to a physical page belonging to a specific processor
632 // within a VA block.
633 typedef struct
634 {
635     // Processor the page is on
636     uvm_processor_id_t processor;
637 
638     // The page index
639     uvm_page_index_t page_index;
640 } block_phys_page_t;
641 
642 static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
643 {
644     return (block_phys_page_t){ processor, page_index };
645 }
646 
647 NV_STATUS uvm_va_block_init(void)
648 {
649     if (uvm_enable_builtin_tests)
650         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
651     else
652         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);
653 
654     if (!g_uvm_va_block_cache)
655         return NV_ERR_NO_MEMORY;
656 
657     g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
658     if (!g_uvm_va_block_gpu_state_cache)
659         return NV_ERR_NO_MEMORY;
660 
661     g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
662     if (!g_uvm_page_mask_cache)
663         return NV_ERR_NO_MEMORY;
664 
665     g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
666     if (!g_uvm_va_block_context_cache)
667         return NV_ERR_NO_MEMORY;
668 
669     return NV_OK;
670 }
671 
672 void uvm_va_block_exit(void)
673 {
674     kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
675     kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
676     kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
677     kmem_cache_destroy_safe(&g_uvm_va_block_cache);
678 }
679 
680 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
681 {
682     uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
683     if (block_context)
684         uvm_va_block_context_init(block_context, mm);
685 
686     return block_context;
687 }
688 
689 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
690 {
691     if (va_block_context)
692         kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
693 }
694 
695 // Convert from page_index to chunk_index. The goal is for each system page in
696 // the region [start, start + size) to be covered by the largest naturally-
697 // aligned user chunk size.
698 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
699                                           NvU64 size,
700                                           uvm_gpu_t *gpu,
701                                           uvm_page_index_t page_index,
702                                           uvm_chunk_size_t *out_chunk_size)
703 {
704     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
705     uvm_chunk_size_t chunk_size, final_chunk_size;
706     size_t num_chunks, num_chunks_total;
707     NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;
708 
709     UVM_ASSERT(PAGE_ALIGNED(start));
710     UVM_ASSERT(PAGE_ALIGNED(size));
711     UVM_ASSERT(size > 0);
712     UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
713     UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
714     BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);
715 
716     // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
717     // mask, since we never deal with chunk sizes smaller than that (although we
718     // may have PTEs mapping pages smaller than that).
719     UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);
720 
721     // Optimize the ideal Pascal+ case: the whole block is covered by a single
722     // 2M page.
723     if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
724         UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
725         final_chunk_size = UVM_CHUNK_SIZE_2M;
726         num_chunks_total = 0;
727         goto out;
728     }
729 
730     // Only one 2M chunk can fit within a VA block on any GPU architecture, so
731     // remove that size from consideration.
732     chunk_sizes &= ~UVM_CHUNK_SIZE_2M;
733 
734     // Next common case: the whole block is aligned and sized to perfectly fit
    // the largest remaining chunk size.
736     final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
737     if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
738         num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
739         goto out;
740     }
741 
742     // We didn't hit our special paths. Do it the hard way.
743 
744     num_chunks_total = 0;
745     addr = start + page_index * PAGE_SIZE;
746     end = start + size;
747     final_chunk_size = 0;
748     UVM_ASSERT(addr < end);
749 
750     // The below loop collapses almost completely when chunk_size == PAGE_SIZE
751     // since in that lowest-common-denominator case everything is already
752     // aligned. Skip it and handle that specially after the loop.
753     //
754     // Note that since we removed 2M already above, this loop will only iterate
755     // once on x86 Pascal+ since only 64K is left.
756     chunk_sizes &= ~PAGE_SIZE;
757 
758     // This loop calculates the number of chunks between start and addr by
759     // calculating the number of whole chunks of each size between them,
760     // starting with the largest allowed chunk size. This requires fewer
761     // iterations than if we began from start and kept calculating the next
762     // larger chunk size boundary.
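    //
    // Illustrative trace (assuming chunk_sizes == {4K, 64K} and
    // PAGE_SIZE == 4K): start == 0x1F000, size == 0x61000 and page_index == 22
    // give addr == 0x35000. In the 64K iteration, aligned_start == 0x20000,
    // aligned_addr == 0x30000 and aligned_end == 0x80000, so final_chunk_size
    // becomes 64K, one whole 64K chunk is counted between start and addr, and
    // addr collapses to 0x20000. The PAGE_SIZE cleanup after the loop then
    // adds one more chunk for the leading 4K page at 0x1F000, giving a chunk
    // index of 2.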
763     for_each_chunk_size_rev(chunk_size, chunk_sizes) {
764         aligned_start = UVM_ALIGN_UP(start, chunk_size);
765         aligned_addr  = UVM_ALIGN_DOWN(addr, chunk_size);
766         aligned_end   = UVM_ALIGN_DOWN(end, chunk_size);
767 
768         // If addr and start are within the same chunk, try smaller
769         if (aligned_start > aligned_addr)
770             continue;
771 
772         // If addr and end are not in the same chunk, then addr is covered by a
773         // single chunk of the current size. Ignore smaller boundaries between
774         // addr and aligned_addr.
775         if (aligned_addr < aligned_end && final_chunk_size == 0) {
776             addr = aligned_addr;
777             final_chunk_size = chunk_size;
778         }
779 
780         // How many chunks of this size are between start and addr? Note that
781         // this might be 0 since aligned_addr and aligned_start could be in the
782         // same chunk.
        num_chunks = uvm_div_pow2_32((NvU32)(aligned_addr - aligned_start), chunk_size);
784         num_chunks_total += num_chunks;
785 
786         // We've already accounted for these chunks, so "remove" them by
787         // bringing start, addr, and end closer together to calculate the
788         // remaining chunk sizes.
789         temp_size = num_chunks * chunk_size;
790         addr -= temp_size;
791         end -= temp_size;
792 
793         // Once there's no separation between addr and start, and we've
794         // successfully found the right chunk size when taking end into account,
795         // we're done.
796         if (addr == start && final_chunk_size)
797             break;
798     }
799 
800     // Handle PAGE_SIZE cleanup since we skipped it in the loop
801     num_chunks_total += (addr - start) / PAGE_SIZE;
802     if (final_chunk_size == 0)
803         final_chunk_size = PAGE_SIZE;
804 
805 out:
806     if (out_chunk_size)
807         *out_chunk_size = final_chunk_size;
808 
809     return num_chunks_total;
810 }
811 
812 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
813                                           NvU64 start,
814                                           NvU64 size,
815                                           uvm_gpu_t *gpu,
816                                           uvm_page_index_t page_index,
817                                           uvm_chunk_size_t *out_chunk_size)
818 {
819     if (uvm_va_block_is_hmm(va_block)) {
820         if (out_chunk_size)
821             *out_chunk_size = PAGE_SIZE;
822         return page_index;
823     }
824 
825     return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
826 }
827 
828 static size_t block_gpu_chunk_index(uvm_va_block_t *block,
829                                     uvm_gpu_t *gpu,
830                                     uvm_page_index_t page_index,
831                                     uvm_chunk_size_t *out_chunk_size)
832 {
833     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
834     uvm_chunk_size_t size;
835     uvm_gpu_chunk_t *chunk;
836     size_t index;
837 
838     index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);
839 
840     UVM_ASSERT(size >= PAGE_SIZE);
841 
842     if (gpu_state) {
843         UVM_ASSERT(gpu_state->chunks);
844         chunk = gpu_state->chunks[index];
845         if (chunk) {
846             UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
847             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
848             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
849         }
850     }
851 
852     if (out_chunk_size)
853         *out_chunk_size = size;
854 
855     return index;
856 }
857 
858 // Compute the size of the chunk known to start at start_page_index
859 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
860 {
861     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
862     uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
863     NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
864     NvU64 size = block->end - start + 1;
865 
866     if (uvm_va_block_is_hmm(block))
867         return PAGE_SIZE;
868 
869     // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
870     // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
871     // Example: 1011000 -> 0001111
872     start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));
873 
874     // Next, compute all sizes (powers of two) which are <= size.
875     pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
876     pow2_leq_size |= pow2_leq_size - 1;
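
    // For example (illustrative), size == 0x30000 (192K) gives
    // rounddown_pow_of_two(size) == 0x20000, so pow2_leq_size covers every
    // power of two up to and including 128K. Combined with a start aligned to
    // 64K (but not 128K) and chunk_sizes == {4K, 64K, 2M}, the allowed sizes
    // below would be {4K, 64K} and the function would return 64K.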
877 
    // Now AND them all together to get our list of GPU-supported chunk sizes
879     // which are aligned to start and will fit within size.
880     allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;
881 
882     // start and size must always be aligned to at least the smallest supported
883     // chunk size (PAGE_SIZE).
    UVM_ASSERT(allowed_sizes & PAGE_SIZE);
885 
886     // Take the largest allowed size
887     return uvm_chunk_find_last_size(allowed_sizes);
888 }
889 
890 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
891 {
892     return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
893 }
894 
895 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
896 {
897     uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
898     return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
899 }
900 
901 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
902 {
903     size_t chunk_index;
904     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
905     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
906 
907     uvm_assert_mutex_locked(&va_block->lock);
908 
909     if (!gpu_state)
910         return NULL;
911 
912     chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
913 
914     return gpu_state->chunks[chunk_index];
915 }
916 
917 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
918                               NvU64 start,
919                               NvU64 end,
920                               uvm_va_block_t **out_block)
921 {
922     uvm_va_block_t *block = NULL;
923     NvU64 size = end - start + 1;
924 
925     UVM_ASSERT(PAGE_ALIGNED(start));
926     UVM_ASSERT(PAGE_ALIGNED(end + 1));
927     UVM_ASSERT(PAGE_ALIGNED(size));
928     UVM_ASSERT(size > 0);
929     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
930 
931     if (va_range) {
932         // Create a managed va_block.
933         UVM_ASSERT(start >= va_range->node.start);
934         UVM_ASSERT(end <= va_range->node.end);
935         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
936     }
937 
938     // Blocks can't span a block alignment boundary
939     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
940 
941     if (uvm_enable_builtin_tests) {
942         uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
943 
944         if (block_wrapper)
945             block = &block_wrapper->block;
946     }
947     else {
948         block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
949     }
950 
951     if (!block)
952         return NV_ERR_NO_MEMORY;
953 
954     nv_kref_init(&block->kref);
955     uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
956     block->start = start;
957     block->end = end;
958     block->va_range = va_range;
959     uvm_tracker_init(&block->tracker);
960     block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;
961 
962     nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);
963 
964     *out_block = block;
965     return NV_OK;
966 }
967 
968 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
969 {
970     NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
971     if (gpu_mapping_addr == 0)
972         return;
973 
974     uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
975     uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
976 }
977 
978 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
979                                                   uvm_va_block_t *block,
980                                                   uvm_page_index_t page_index,
981                                                   uvm_gpu_t *gpu)
982 {
983     NV_STATUS status;
984     uvm_chunk_size_t chunk_size;
985 
    // When the Confidential Computing feature is enabled, transfers don't use
    // the DMA mapping of CPU chunks (since it's protected memory); they use
    // the DMA address of the unprotected DMA buffer instead.
989     if (uvm_conf_computing_mode_enabled(gpu))
990         return NV_OK;
991 
992     status = uvm_cpu_chunk_map_gpu(chunk, gpu);
993     if (status != NV_OK)
994         return status;
995 
996     chunk_size = uvm_cpu_chunk_get_size(chunk);
997 
998     // TODO: Bug 3744779: Handle benign assertion in
999     //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
1000     //       failure.
1001     status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
1002                                                      uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
1003                                                      uvm_va_block_cpu_page_address(block, page_index),
1004                                                      chunk_size,
1005                                                      block,
1006                                                      UVM_ID_CPU);
1007     if (status != NV_OK)
1008         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1009 
1010     return status;
1011 }
1012 
1013 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1014 {
1015     uvm_cpu_chunk_t *chunk;
1016     uvm_page_index_t page_index;
1017 
1018     for_each_cpu_chunk_in_block(chunk, page_index, block)
1019         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1020 }
1021 
1022 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1023 {
1024     NV_STATUS status;
1025     uvm_cpu_chunk_t *chunk;
1026     NvU64 block_mapping_size = uvm_va_block_size(block);
1027     uvm_page_index_t page_index;
1028 
1029     UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
1030 
1031     for_each_cpu_chunk_in_block(chunk, page_index, block) {
1032         UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
1033                        "GPU%u DMA address 0x%llx\n",
1034                        uvm_id_value(gpu->id),
1035                        uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));
1036 
1037         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
1038         if (status != NV_OK)
1039             goto error;
1040     }
1041 
1042     return NV_OK;
1043 
1044 error:
1045     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
1046     return status;
1047 }
1048 
1049 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
1050                                                      uvm_gpu_t *local_gpu,
1051                                                      uvm_gpu_chunk_t *chunk,
1052                                                      uvm_gpu_t *accessing_gpu)
1053 {
1054     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1055     return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
1056                                                          peer_addr,
1057                                                          block->start + chunk->va_block_page_index * PAGE_SIZE,
1058                                                          uvm_gpu_chunk_get_size(chunk),
1059                                                          block,
1060                                                          local_gpu->id);
1061 }
1062 
1063 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
1064                                                    uvm_gpu_chunk_t *chunk,
1065                                                    uvm_gpu_t *accessing_gpu)
1066 {
1067     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1068     uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
1069 }
1070 
1071 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
1072                                                         uvm_gpu_t *local_gpu,
1073                                                         uvm_gpu_t *accessing_gpu)
1074 {
1075     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1076     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1077     size_t num_chunks, i;
1078     NV_STATUS status;
1079 
1080     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1081                                        accessing_gpu->id));
1082 
1083     // If no chunks are allocated currently, the mappings will be created later
1084     // at chunk allocation.
1085     if (!gpu_state || !gpu_state->chunks)
1086         return NV_OK;
1087 
1088     num_chunks = block_num_gpu_chunks(block, local_gpu);
1089     for (i = 0; i < num_chunks; i++) {
1090         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1091         if (!chunk)
1092             continue;
1093 
1094         status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
1095         if (status != NV_OK)
1096             goto error;
1097 
1098         status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
1099         if (status != NV_OK)
1100             goto error;
1101     }
1102 
1103     return NV_OK;
1104 
1105 error:
1106     while (i-- > 0) {
1107         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1108         if (chunk) {
1109             // Indirect peer mappings are removed lazily by PMM, so if an error
1110             // occurs the mappings established above will be removed when the
1111             // chunk is freed later on. We only need to remove the sysmem
1112             // reverse mappings.
1113             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1114         }
1115     }
1116 
1117     return status;
1118 }
1119 
1120 // Mappings for indirect peers are removed lazily by PMM, but we need to remove
1121 // the entries from the reverse map.
1122 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
1123                                                      uvm_gpu_t *local_gpu,
1124                                                      uvm_gpu_t *accessing_gpu)
1125 {
1126     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1127     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1128     size_t num_chunks, i;
1129 
1130     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1131                                        accessing_gpu->id));
1132 
1133     // Exit if no chunks are allocated currently.
1134     if (!gpu_state || !gpu_state->chunks)
1135         return;
1136 
1137     num_chunks = block_num_gpu_chunks(block, local_gpu);
1138     for (i = 0; i < num_chunks; i++) {
1139         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1140         if (chunk)
1141             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1142     }
1143 }
1144 
1145 // Retrieves the gpu_state for the given GPU. The returned pointer is
1146 // internally managed and will be allocated (and freed) automatically,
1147 // rather than by the caller.
1148 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
1149 {
1150     NV_STATUS status;
1151     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1152 
1153     if (gpu_state)
1154         return gpu_state;
1155 
1156     gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
1157     if (!gpu_state)
1158         return NULL;
1159 
1160     gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
1161     if (!gpu_state->chunks)
1162         goto error;
1163 
1164     block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;
1165 
1166     status = block_gpu_map_phys_all_cpu_pages(block, gpu);
1167     if (status != NV_OK)
1168         goto error;
1169 
1170     return gpu_state;
1171 
1172 error:
1173     uvm_kvfree(gpu_state->chunks);
1174     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
1175     block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;
1176 
1177     return NULL;
1178 }
1179 
1180 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
1181 {
1182     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1183     uvm_gpu_id_t gpu_id;
1184 
1185     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1186     uvm_assert_mutex_locked(&va_block->lock);
1187 
1188     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
1189         if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
1190             return NV_ERR_NO_MEMORY;
1191     }
1192 
1193     return NV_OK;
1194 }
1195 
1196 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
1197                                           uvm_cpu_chunk_t *chunk,
1198                                           uvm_page_index_t page_index)
1199 {
1200     uvm_gpu_id_t id;
1201 
1202     for_each_gpu_id(id) {
1203         if (uvm_va_block_gpu_state_get(block, id))
1204             cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
1205     }
1206 }
1207 
1208 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
1209                                              uvm_page_index_t page_index)
1210 {
1211     NV_STATUS status;
1212     uvm_gpu_id_t id;
1213     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1214     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1215     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1216 
1217     // We can't iterate over va_space->registered_gpus because we might be
1218     // on the eviction path, which does not have the VA space lock held. We have
1219     // the VA block lock held however, so the gpu_states can't change.
1220     uvm_assert_mutex_locked(&block->lock);
1221 
1222     for_each_gpu_id(id) {
1223         uvm_gpu_t *gpu;
1224 
1225         if (!uvm_va_block_gpu_state_get(block, id))
1226             continue;
1227 
1228         gpu = block_get_gpu(block, id);
1229         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
1230         if (status != NV_OK)
1231             goto error;
1232     }
1233 
1234     return NV_OK;
1235 
1236 error:
1237     uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
1238     return status;
1239 }
1240 
1241 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1242 {
1243     uvm_cpu_chunk_t *chunk;
1244     uvm_page_index_t page_index, next_page_index;
1245     uvm_va_block_region_t chunk_region;
1246 
1247     for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
1248         chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
1249 
1250         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
1251         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
1252         uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
1253         uvm_cpu_chunk_remove_from_block(va_block, page_index);
1254         uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1255         uvm_cpu_chunk_free(chunk);
1256     }
1257 
1258     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1259         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1260     if (uvm_page_mask_empty(&va_block->cpu.resident))
1261         uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
1262 }
1263 
1264 // Create physical mappings to allow other GPUs to access this chunk.
1265 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1266 {
1267     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1268     uvm_gpu_t *accessing_gpu, *remove_gpu;
1269     NV_STATUS status;
1270 
1271     // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
1272     // the eviction path, so we can assume that the VA space is locked.
1273     //
1274     // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
1275     //       meaning we may need to allocate peer memory and map it on the
1276     //       eviction path. That will require making sure that peers can't be
1277     //       enabled or disabled either in the VA space or globally within this
1278     //       function.
1279     uvm_assert_rwsem_locked(&va_space->lock);
1280     uvm_assert_mutex_locked(&block->lock);
1281 
1282     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1283         status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
1284         if (status != NV_OK)
1285             goto error;
1286 
1287         status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
1288         if (status != NV_OK)
1289             goto error;
1290     }
1291 
1292     return NV_OK;
1293 
1294 error:
1295     for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1296         if (remove_gpu == accessing_gpu)
1297             break;
1298 
1299         // Indirect peer mappings are removed lazily by PMM, so if an error
1300         // occurs the mappings established above will be removed when the
1301         // chunk is freed later on. We only need to remove the sysmem
1302         // reverse mappings.
1303         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
1304     }
1305 
1306     return status;
1307 }
1308 
1309 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1310 {
1311     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1312     uvm_gpu_t *peer_gpu;
1313 
1314     uvm_assert_rwsem_locked(&va_space->lock);
1315     uvm_assert_mutex_locked(&block->lock);
1316 
1317     // Indirect peer mappings are removed lazily by PMM, so we only need to
1318     // remove the sysmem reverse mappings.
1319     for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
1320         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
1321 }
1322 
1323 // Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1325 {
1326     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1327     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1328     uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
1329 }
1330 
1331 // Mark a CPU page as clean.
1332 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
1333 {
1334     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1335     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1336     uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
1337 }
1338 
1339 // Check if a CPU page is dirty.
1340 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1341 {
1342     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1343     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1344     return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
1345 }
1346 
1347 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
1348                                        uvm_chunk_size_t alloc_size,
1349                                        uvm_cpu_chunk_alloc_flags_t flags,
1350                                        uvm_cpu_chunk_t **chunk)
1351 {
1352     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1353 
1354     // Return out of memory error if the tests have requested it. As opposed to
1355     // other error injection settings, this one fails N times and then succeeds.
1356     // TODO: Bug 3701182: This will print a warning in Linux kernels newer than
1357     // 5.16.0-rc1+.
1358     if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
1359         if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
1360             block_test->inject_cpu_pages_allocation_error_count--;
1361         return NV_ERR_NO_MEMORY;
1362     }
1363 
1364     return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
1365 }
1366 
// Allocates the input pages in the block, if they don't already exist.
//
// Also maps the pages for physical access by all GPUs used by the block, which
// is required for IOMMU support. This mapping is skipped on GPUs without
// access to CPU memory, e.g., when the Confidential Computing feature is
// enabled.
1372 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
1373                                           uvm_page_mask_t *populate_page_mask,
1374                                           uvm_va_block_region_t populate_region,
1375                                           uvm_va_block_context_t *block_context)
1376 {
1377     NV_STATUS status = NV_OK;
1378     uvm_cpu_chunk_t *chunk;
1379     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1380     uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
1381     uvm_chunk_size_t alloc_size;
1382     uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
1383     uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
1384     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1385     uvm_processor_mask_t uvm_lite_gpus;
1386     uvm_page_index_t page_index;
1387     uvm_gpu_id_t id;
1388 
1389     // Check whether all requested pages have already been allocated.
1390     uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
1391     if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
1392                               &block_context->scratch_page_mask,
1393                               &block->cpu.allocated))
1394         return NV_OK;
1395 
1396     if (block_test) {
1397         if (block_test->cpu_chunk_allocation_size_mask)
1398             cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
1399     }
1400 
1401     uvm_page_mask_zero(resident_mask);
1402     for_each_id_in_mask (id, &block->resident)
1403         uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));
1404 
1405     // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
1406     // should be used in order to avoid extra copies due to dirty compound
1407     // pages. HMM va_blocks also require PAGE_SIZE allocations.
1408     // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
1409     // migrations.
1410     uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors);
1411     if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block))
1412         cpu_allocation_sizes = PAGE_SIZE;
1413 
1414     if (block_context->mm)
1415         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;
1416 
1417     UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
1418     UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);
1419 
1420     for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
1421         uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
1422         uvm_va_block_region_t region = populate_region;
1423 
1424         if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
1425             page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
1426             continue;
1427         }
1428 
1429         UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));
1430 
1431         chunk_alloc_flags = alloc_flags;
1432 
        // Attempt to allocate CPU pages with the largest physically contiguous
        // size from the set of allowed CPU chunk sizes. This is accomplished
        // by:
        //   1. Aligning the CPU page address down to the allocation size.
        //   2. Ensuring that the entire allocation region fits within the VA
        //      block.
        //   3. Ensuring that the region covered by the allocation is empty.
1440         for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1441             NvU64 alloc_virt_addr;
1442 
1443             chunk = NULL;
1444             alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size);
1445 
1446             if (!uvm_va_block_contains_address(block, alloc_virt_addr) ||
1447                 !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1))
1448                 continue;
1449 
1450             region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);
1451 
1452             if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
1453                 continue;
1454 
            // If not all pages in the allocation region are resident somewhere,
            // zero out the whole allocation. This could be wasteful if only a
            // few pages in a high-order allocation need to be zeroed out, but
            // the alternative is to map single sub-pages one-by-one.
1460             if (!uvm_page_mask_region_full(resident_mask, region))
1461                 chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
1462 
1463             status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
1464             if (status == NV_OK) {
1465                 page_index = region.first;
1466                 break;
1467             }
1468 
1469             UVM_ASSERT(status == NV_ERR_NO_MEMORY);
1470         }
1471 
1472         if (status != NV_OK)
1473             break;
1474 
1475         status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
1476         if (status != NV_OK) {
1477             uvm_cpu_chunk_free(chunk);
1478             return status;
1479         }
1480 
1481         status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
1482         if (status != NV_OK)
1483             break;
1484 
1485         // Skip iterating over all pages covered by the allocated chunk.
1486         page_index = region.outer - 1;
1487     }
1488 
1489     if (status != NV_OK && chunk) {
1490         uvm_cpu_chunk_remove_from_block(block, page_index);
1491         uvm_cpu_chunk_free(chunk);
1492     }
1493 
1494     return status;
1495 }
1496 
1497 // Try allocating a chunk. If eviction was required,
1498 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
1499 // unlocked and relocked. The caller is responsible for adding the chunk to the
1500 // retry used_chunks list.
1501 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
1502                                        uvm_va_block_retry_t *retry,
1503                                        uvm_gpu_t *gpu,
1504                                        uvm_chunk_size_t size,
1505                                        uvm_gpu_chunk_t **out_gpu_chunk)
1506 {
1507     NV_STATUS status = NV_OK;
1508     uvm_gpu_chunk_t *gpu_chunk;
1509 
1510     // First try getting a free chunk from previously-made allocations.
1511     gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
1512     if (!gpu_chunk) {
1513         uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1514         if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
1515             // Force eviction by pretending the allocation failed with no memory
1516             --block_test->user_pages_allocation_retry_force_count;
1517             status = NV_ERR_NO_MEMORY;
1518         }
1519         else {
1520             // Try allocating a new one without eviction
1521             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
1522         }
1523 
1524         if (status == NV_ERR_NO_MEMORY) {
1525             // If that fails with no memory, try allocating with eviction and
1526             // return back to the caller immediately so that the operation can
1527             // be restarted.
1528             uvm_mutex_unlock(&block->lock);
1529 
1530             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
1531             if (status == NV_OK) {
1532                 block_retry_add_free_chunk(retry, gpu_chunk);
1533                 status = NV_ERR_MORE_PROCESSING_REQUIRED;
1534             }
1535 
1536             uvm_mutex_lock(&block->lock);
1537             return status;
1538         }
1539         else if (status != NV_OK) {
1540             return status;
1541         }
1542     }
1543 
1544     *out_gpu_chunk = gpu_chunk;
1545     return NV_OK;
1546 }
1547 
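// Returns true if any GPU page tables (4k, big, or 2M ranges) are allocated
// for the given GPU on this block.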
1548 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
1549 {
1550     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1551 
1552     if (!gpu_state)
1553         return false;
1554 
1555     return gpu_state->page_table_range_4k.table  ||
1556            gpu_state->page_table_range_big.table ||
1557            gpu_state->page_table_range_2m.table;
1558 }
1559 
1560 // A helper to get a known-to-be-present GPU VA space given a VA block that's
1561 // locked. In order to use this function, the caller must know that at least one
1562 // of these conditions is true:
1563 //
1564 // 1) The VA space lock is held
1565 // 2) The VA block has active page tables for the GPU
1566 //
1567 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go
1568 // away.
1569 //
1570 // On the eviction path, we don't have a lock on the VA space state. However,
1571 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page
1572 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU
1573 // has page tables (#2), the gpu_va_space can't go away while we're holding the
1574 // block lock.
1575 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1576 {
1577     uvm_gpu_va_space_t *gpu_va_space;
1578     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1579 
1580     UVM_ASSERT(gpu);
1581 
1582     if (!block_gpu_has_page_tables(va_block, gpu))
1583         uvm_assert_rwsem_locked(&va_space->lock);
1584 
1585     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));
1586 
1587     gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
1588 
1589     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1590     UVM_ASSERT(gpu_va_space->va_space == va_space);
1591     UVM_ASSERT(gpu_va_space->gpu == gpu);
1592 
1593     return gpu_va_space;
1594 }
1595 
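// Returns whether the given GPU could map this block with a single 2M PTE:
// the block must span a whole 2M region and the GPU's page tables must
// support the 2M page size. HMM blocks are excluded for now.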
1596 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
1597 {
1598     uvm_gpu_va_space_t *gpu_va_space;
1599 
1600     // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
1601     if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
1602         return false;
1603 
1604     UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);
1605 
1606     gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
1607     return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
1608 }
1609 
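// Returns the big page size used by the given GPU's page tables in this
// block's VA space.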
1610 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1611 {
1612     uvm_gpu_va_space_t *gpu_va_space;
1613 
1614     gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
1615     return gpu_va_space->page_tables.big_page_size;
1616 }
1617 
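// Compute the page region of [start, end] (page indexes relative to start)
// which can be covered by big pages of the given size: from the first
// big_page_size-aligned address at or above start up to the last big page
// boundary at or below end + 1. An empty region is returned if no aligned big
// page fits in the range. For example, with 4K base pages, a 64K big page
// size, and a 64K-aligned range of 128K, the result covers page indexes
// [0, 32).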
1618 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
1619 {
1620     NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
1621     NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
1622 
1623     // The range must fit within a VA block
1624     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
1625 
1626     if (outer_addr <= first_addr)
1627         return uvm_va_block_region(0, 0);
1628 
1629     return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
1630 }
1631 
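// Number of whole big pages of the given size contained in [start, end].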
1632 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
1633 {
1634     uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
1635     return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
1636 }
1637 
1638 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
1639 {
1640     return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
1641 }
1642 
1643 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
1644                                                           uvm_va_block_region_t region,
1645                                                           NvU32 big_page_size)
1646 {
1647     NvU64 start = uvm_va_block_region_start(va_block, region);
1648     NvU64 end = uvm_va_block_region_end(va_block, region);
1649     uvm_va_block_region_t big_region;
1650 
1651     UVM_ASSERT(start < va_block->end);
1652     UVM_ASSERT(end <= va_block->end);
1653 
1654     big_region = range_big_page_region_all(start, end, big_page_size);
1655     if (big_region.outer) {
1656         big_region.first += region.first;
1657         big_region.outer += region.first;
1658     }
1659 
1660     return big_region;
1661 }
1662 
1663 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
1664 {
1665     return range_num_big_pages(va_block->start, va_block->end, big_page_size);
1666 }
1667 
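// Returns the address of the big page with the given index within the block,
// for the given big page size.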
1668 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1669 {
1670     NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
1671     UVM_ASSERT(addr >= va_block->start);
1672     UVM_ASSERT(addr < va_block->end);
1673     return addr;
1674 }
1675 
1676 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1677 {
1678     NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
1679 
1680     // Assume that we don't have to handle multiple big PTEs per system page.
1681     // It's not terribly difficult to implement, but we don't currently have a
1682     // use case.
1683     UVM_ASSERT(big_page_size >= PAGE_SIZE);
1684 
1685     return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
1686 }
1687 
1688 // Returns the big page index (the bit index within
1689 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
1690 // page_index cannot be covered by a big PTE due to alignment or block size,
1691 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
1692 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
1693 {
1694     uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
1695     size_t big_index;
1696 
1697     // Note that this condition also handles the case of having no big pages in
1698     // the block, in which case .first >= .outer.
1699     if (page_index < big_region_all.first || page_index >= big_region_all.outer)
1700         return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;
1701 
1702     big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);
1703 
1704     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
1705     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);
1706 
1707     return big_index;
1708 }
1709 
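// Initialize mask_out to the set of pages covered by the big PTEs set in
// big_ptes_in, using the given GPU's big page size.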
1710 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
1711                                              uvm_gpu_t *gpu,
1712                                              uvm_page_mask_t *mask_out,
1713                                              const unsigned long *big_ptes_in)
1714 {
1715     uvm_va_block_region_t big_region;
1716     size_t big_page_index;
1717     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
1718 
1719     uvm_page_mask_zero(mask_out);
1720 
1721     for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
1722         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
1723         uvm_page_mask_region_fill(mask_out, big_region);
1724     }
1725 }
1726 
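// Returns the size of the CPU PTE mapping page_index, or 0 if the page is not
// mapped by the CPU.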
1727 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1728 {
1729     if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
1730         return 0;
1731 
1732     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU));
1733 
1734     // Despite the fact that physical CPU memory can be allocated at sizes
1735     // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory
1736     // with 4K PTEs. Until the core kernel adds support for PMD mappings,
1737     // the return value of this function will remain at PAGE_SIZE.
1738     return PAGE_SIZE;
1739 }
1740 
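// Returns the size of the PTE mapping page_index on the given GPU (2M, big,
// or 4K), or 0 if the page is not mapped by that GPU.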
1741 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
1742 {
1743     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1744     size_t big_page_size, big_page_index;
1745 
1746     if (!gpu_state)
1747         return 0;
1748 
1749     if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
1750         return 0;
1751 
1752     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id));
1753 
1754     if (gpu_state->pte_is_2m)
1755         return UVM_PAGE_SIZE_2M;
1756 
1757     big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id));
1758     big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size);
1759     if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes))
1760         return big_page_size;
1761 
1762     return UVM_PAGE_SIZE_4K;
1763 }
1764 
1765 // Get the size of the physical allocation backing the page, or 0 if not
1766 // resident. Note that this is different from uvm_va_block_page_size_* because
1767 // those return the size of the PTE which maps the page index, which may be
1768 // smaller than the physical allocation.
1769 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
1770 {
1771     uvm_va_block_gpu_state_t *gpu_state;
1772     uvm_chunk_size_t chunk_size;
1773 
1774     if (UVM_ID_IS_CPU(page.processor)) {
1775         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index);
1776 
1777         if (!uvm_page_mask_test(&block->cpu.resident, page.page_index))
1778             return 0;
1779 
1780         UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
1781         return (NvU32)uvm_cpu_chunk_get_size(chunk);
1782     }
1783 
1784     gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
1785     if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index))
1786         return 0;
1787 
1788     UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
1789     block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
1790     return (NvU32)chunk_size;
1791 }
1792 
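// Translate access permissions to the corresponding CPU PTE bit index.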
1793 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
1794 {
1795     uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
1796 
1797     // ATOMIC and WRITE are synonyms for the CPU
1798     if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE)
1799         pte_bit_index = UVM_PTE_BITS_CPU_WRITE;
1800     else if (prot == UVM_PROT_READ_ONLY)
1801         pte_bit_index = UVM_PTE_BITS_CPU_READ;
1802     else
1803         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1804 
1805     return pte_bit_index;
1806 }
1807 
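// Translate access permissions to the corresponding GPU PTE bit index.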
1808 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
1809 {
1810     uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX;
1811 
1812     if (prot == UVM_PROT_READ_WRITE_ATOMIC)
1813         pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC;
1814     else if (prot == UVM_PROT_READ_WRITE)
1815         pte_bit_index = UVM_PTE_BITS_GPU_WRITE;
1816     else if (prot == UVM_PROT_READ_ONLY)
1817         pte_bit_index = UVM_PTE_BITS_GPU_READ;
1818     else
1819         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1820 
1821     return pte_bit_index;
1822 }
1823 
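// Get the mask of pages resident on the given processor. For GPUs, the GPU's
// block state must already be allocated.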
1824 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1825 {
1826     uvm_va_block_gpu_state_t *gpu_state;
1827 
1828     if (UVM_ID_IS_CPU(processor))
1829         return &block->cpu.resident;
1830 
1831     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1832 
1833     UVM_ASSERT(gpu_state);
1834     return &gpu_state->resident;
1835 }
1836 
1837 // Get the page residency mask for a processor
1838 //
// Notably, this will allocate the GPU state if it is not yet present; if that
// allocation fails, NULL is returned.
1841 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor)
1842 {
1843     uvm_va_block_gpu_state_t *gpu_state;
1844 
1845     if (UVM_ID_IS_CPU(processor))
1846         return &block->cpu.resident;
1847 
1848     gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
1849     if (!gpu_state)
1850         return NULL;
1851 
1852     return &gpu_state->resident;
1853 }
1854 
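// Get the mask of pages mapped on the given processor with at least the given
// protection.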
1855 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block,
1856                                                            uvm_processor_id_t processor,
1857                                                            uvm_prot_t prot)
1858 {
1859     uvm_va_block_gpu_state_t *gpu_state;
1860 
1861     if (UVM_ID_IS_CPU(processor))
1862         return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)];
1863 
1864     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1865 
1866     UVM_ASSERT(gpu_state);
1867     return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)];
1868 }
1869 
1870 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1871 {
1872     return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
1873 }
1874 
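// Get the mask of pages which have been evicted from the given GPU.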
1875 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
1876 {
1877     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
1878     UVM_ASSERT(gpu_state);
1879 
1880     return &gpu_state->evicted;
1881 }
1882 
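// Returns true if the page is resident on at least one processor.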
1883 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index)
1884 {
1885     uvm_processor_id_t id;
1886     for_each_id_in_mask(id, &block->resident) {
1887         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index))
1888             return true;
1889     }
1890 
1891     return false;
1892 }
1893 
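// Returns true if backing memory for the page has been allocated on the given
// processor.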
1894 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1895 {
1896     uvm_va_block_gpu_state_t *gpu_state;
1897     size_t chunk_index;
1898 
1899     if (UVM_ID_IS_CPU(proc))
1900         return uvm_page_mask_test(&block->cpu.allocated, page_index);
1901 
1902     gpu_state = uvm_va_block_gpu_state_get(block, proc);
1903     if (!gpu_state)
1904         return false;
1905 
1906     chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL);
1907     return gpu_state->chunks[chunk_index] != NULL;
1908 }
1909 
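// Returns true if the page is resident on the given processor.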
1910 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1911 {
1912     const uvm_page_mask_t *resident_mask;
1913 
1914     if (UVM_ID_IS_CPU(proc)) {
1915         resident_mask = &block->cpu.resident;
1916     }
1917     else {
1918         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc);
1919         if (!gpu_state)
1920             return false;
1921 
1922         resident_mask = &gpu_state->resident;
1923     }
1924 
1925     return uvm_page_mask_test(resident_mask, page_index);
1926 }
1927 
// Compute the GPUs that have at least the given access permissions for the
// given region. The function sets a GPU's bit if any page in the region has
// those permissions on that GPU.
1931 static void block_region_authorized_gpus(uvm_va_block_t *va_block,
1932                                          uvm_va_block_region_t region,
1933                                          uvm_prot_t access_permission,
1934                                          uvm_processor_mask_t *authorized_gpus)
1935 {
1936     uvm_gpu_id_t gpu_id;
1937     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission);
1938 
1939     uvm_processor_mask_zero(authorized_gpus);
1940 
1941     // Test all GPUs with mappings on the block
1942     for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) {
1943         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1944         if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region))
1945             uvm_processor_mask_set(authorized_gpus, gpu_id);
1946     }
1947 }
1948 
// Compute the processors that have at least the given access permissions for
// the given region. The function sets a processor's bit if any page in the
// region has those permissions on that processor.
1952 static void block_region_authorized_processors(uvm_va_block_t *va_block,
1953                                                uvm_va_block_region_t region,
1954                                                uvm_prot_t access_permission,
1955                                                uvm_processor_mask_t *authorized_processors)
1956 {
1957     uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission);
1958 
1959     // Compute GPUs
1960     block_region_authorized_gpus(va_block, region, access_permission, authorized_processors);
1961 
1962     // Test CPU
1963     if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) &&
1964         !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) {
1965         uvm_processor_mask_set(authorized_processors, UVM_ID_CPU);
1966     }
1967 }
1968 
1969 static void block_page_authorized_processors(uvm_va_block_t *va_block,
1970                                              uvm_page_index_t page_index,
1971                                              uvm_prot_t access_permission,
1972                                              uvm_processor_mask_t *authorized_processors)
1973 {
1974     block_region_authorized_processors(va_block,
1975                                        uvm_va_block_region_for_page(page_index),
1976                                        access_permission,
1977                                        authorized_processors);
1978 }
1979 
1980 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
1981                                                     uvm_va_block_region_t region,
1982                                                     uvm_gpu_id_t gpu_id,
1983                                                     uvm_prot_t required_prot)
1984 {
1985     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot);
1986     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1987 
1988     if (!gpu_state)
1989         return false;
1990 
1991     return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region);
1992 }
1993 
1994 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
1995                                                           uvm_va_block_region_t region,
1996                                                           uvm_processor_id_t processor_id,
1997                                                           uvm_prot_t required_prot)
1998 {
1999     if (UVM_ID_IS_CPU(processor_id)) {
2000         uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot);
2001 
2002         return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region);
2003     }
2004     else {
2005         return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot);
2006     }
2007 }
2008 
2009 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2010                                          uvm_page_index_t page_index,
2011                                          uvm_gpu_id_t gpu_id,
2012                                          uvm_prot_t required_prot)
2013 {
2014     return block_is_gpu_authorized_on_whole_region(va_block,
2015                                                    uvm_va_block_region_for_page(page_index),
2016                                                    gpu_id,
2017                                                    required_prot);
2018 }
2019 
2020 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block,
2021                                                uvm_page_index_t page_index,
2022                                                uvm_processor_id_t processor_id,
2023                                                uvm_prot_t required_prot)
2024 {
2025     return block_is_processor_authorized_on_whole_region(va_block,
2026                                                          uvm_va_block_region_for_page(page_index),
2027                                                          processor_id,
2028                                                          required_prot);
2029 }
2030 
// Compute the GPUs that have a copy of the given page resident in their memory
2032 static void block_page_resident_gpus(uvm_va_block_t *va_block,
2033                                      uvm_page_index_t page_index,
2034                                      uvm_processor_mask_t *resident_gpus)
2035 {
2036     uvm_gpu_id_t id;
2037     uvm_processor_mask_zero(resident_gpus);
2038 
2039     for_each_gpu_id_in_mask(id, &va_block->resident) {
2040         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) {
2041             UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
2042             uvm_processor_mask_set(resident_gpus, id);
2043         }
2044     }
2045 }
2046 
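// Compute the full set of processors (CPU and GPUs) which have a copy of the
// given page resident in their memory.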
2047 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2048                                            uvm_page_index_t page_index,
2049                                            uvm_processor_mask_t *resident_processors)
2050 {
2051     block_page_resident_gpus(va_block, page_index, resident_processors);
2052 
2053     if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) {
2054         UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
2055         uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
2056     }
2057 }
2058 
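// Count how many processors have the given page resident.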
2059 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index)
2060 {
2061     uvm_processor_mask_t resident_processors;
2062     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
2063 
2064     return uvm_processor_mask_get_count(&resident_processors);
2065 }
2066 
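// Find the processor closest to 'processor' which has the page resident,
// optionally restricted to the processors in processor_mask. Returns
// UVM_ID_INVALID if the page is not resident anywhere, except for HMM blocks
// which fall back to the CPU.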
2067 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block,
2068                                                                   uvm_page_index_t page_index,
2069                                                                   uvm_processor_id_t processor,
2070                                                                   const uvm_processor_mask_t *processor_mask)
2071 {
2072     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2073     uvm_processor_mask_t search_mask;
2074     uvm_processor_id_t id;
2075 
2076     if (processor_mask)
2077         uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident);
2078     else
2079         uvm_processor_mask_copy(&search_mask, &va_block->resident);
2080 
2081     for_each_closest_id(id, &search_mask, processor, va_space) {
2082         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index))
2083             return id;
2084     }
2085 
2086     // HMM va_blocks don't know if a page is CPU resident until either
2087     // migrate_vma_setup() or hmm_range_fault() is called. If a page isn't
2088     // resident anywhere, assume it is CPU resident.
2089     if (uvm_va_block_is_hmm(va_block))
2090         return UVM_ID_CPU;
2091 
2092     return UVM_ID_INVALID;
2093 }
2094 
2095 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2096                                                           uvm_page_index_t page_index,
2097                                                           uvm_processor_id_t processor)
2098 {
2099     return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL);
2100 }
2101 
2102 // We don't track the specific aperture of each mapped page. Instead, we assume
2103 // that each virtual mapping from a given processor always targets the closest
2104 // processor on which that page is resident (with special rules for UVM-Lite).
2105 //
2106 // This function verifies that assumption: before a page becomes resident on a
2107 // new location, assert that no processor has a valid mapping to a farther
2108 // processor on that page.
2109 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency)
2110 {
2111     uvm_processor_mask_t resident_procs, mapped_procs;
2112     uvm_processor_id_t mapped_id, closest_id;
2113     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2114 
2115     uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block));
2116 
2117     for_each_id_in_mask(mapped_id, &mapped_procs) {
2118         if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index))
2119             continue;
2120 
2121         uvm_va_block_page_resident_processors(block, page_index, &resident_procs);
2122         UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs));
2123         UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency));
2124         uvm_processor_mask_set(&resident_procs, new_residency);
2125         closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id);
2126         UVM_ASSERT(!uvm_id_equal(closest_id, new_residency));
2127     }
2128 
2129     return true;
2130 }
2131 
2132 // Returns the processor to which page_index should be mapped on gpu
2133 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block,
2134                                                          uvm_gpu_t *gpu,
2135                                                          uvm_page_index_t page_index)
2136 {
2137     uvm_processor_id_t dest_id;
2138 
2139     // UVM-Lite GPUs can only map pages on the preferred location
2140     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id))
2141         return uvm_va_range_get_policy(block->va_range)->preferred_location;
2142 
2143     // Otherwise we always map the closest resident processor
2144     dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id);
2145     UVM_ASSERT(UVM_ID_IS_VALID(dest_id));
2146     return dest_id;
2147 }
2148 
2149 // Returns the processor to which page_index should be mapped on mapping_id
2150 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block,
2151                                                      uvm_processor_id_t mapping_id,
2152                                                      uvm_page_index_t page_index)
2153 {
2154 
2155     if (UVM_ID_IS_CPU(mapping_id))
2156         return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id);
2157 
2158     return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index);
2159 }
2160 
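// Compute the set of processors which map page_index with a mapping that
// targets memory resident on resident_id.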
2161 static void block_get_mapped_processors(uvm_va_block_t *block,
2162                                         uvm_processor_id_t resident_id,
2163                                         uvm_page_index_t page_index,
2164                                         uvm_processor_mask_t *mapped_procs)
2165 {
2166     uvm_processor_id_t mapped_id;
2167 
2168     uvm_processor_mask_zero(mapped_procs);
2169 
2170     for_each_id_in_mask(mapped_id, &block->mapped) {
2171         if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) {
2172             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2173 
2174             if (uvm_id_equal(to_map_id, resident_id))
2175                 uvm_processor_mask_set(mapped_procs, mapped_id);
2176         }
2177     }
2178 }
2179 
2180 // We use block_gpu_get_processor_to_map to find the destination processor of a
2181 // given GPU mapping. This function is called when the mapping is established to
2182 // sanity check that the destination of the mapping matches the query.
2183 static bool block_check_mapping_residency_region(uvm_va_block_t *block,
2184                                                  uvm_gpu_t *gpu,
2185                                                  uvm_processor_id_t mapping_dest,
2186                                                  uvm_va_block_region_t region,
2187                                                  const uvm_page_mask_t *page_mask)
2188 {
2189     uvm_page_index_t page_index;
2190     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2191         NvU64 va = uvm_va_block_cpu_page_address(block, page_index);
2192         uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index);
2193         UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map),
2194                        "VA 0x%llx on %s: mapping %s, supposed to map %s",
2195                        va,
2196                        uvm_gpu_name(gpu),
2197                        block_processor_name(block, mapping_dest),
2198                        block_processor_name(block, proc_to_map));
2199     }
2200     return true;
2201 }
2202 
2203 static bool block_check_mapping_residency(uvm_va_block_t *block,
2204                                           uvm_gpu_t *gpu,
2205                                           uvm_processor_id_t mapping_dest,
2206                                           const uvm_page_mask_t *page_mask)
2207 {
2208     return block_check_mapping_residency_region(block,
2209                                                 gpu,
2210                                                 mapping_dest,
2211                                                 uvm_va_block_region_from_block(block),
2212                                                 page_mask);
2213 }
2214 
2215 // Check that there are no mappings targeting resident_id from any processor in
2216 // the block.
2217 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id)
2218 {
2219     uvm_processor_id_t mapped_id;
2220     uvm_page_index_t page_index;
2221 
2222     for_each_id_in_mask(mapped_id, &block->mapped) {
2223         const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id);
2224 
2225         for_each_va_block_page_in_mask(page_index, map_mask, block) {
2226             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2227             UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id));
2228         }
2229     }
2230 
2231     return true;
2232 }
2233 
// Zero all pages of the newly-populated chunk which are not resident anywhere
// else in the system, adding that work to the block's tracker. In all cases,
// this function adds a dependency on the passed-in tracker to the block's
// tracker.
2237 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
2238                                           uvm_gpu_t *gpu,
2239                                           uvm_gpu_chunk_t *chunk,
2240                                           uvm_va_block_region_t chunk_region,
2241                                           uvm_tracker_t *tracker)
2242 {
2243     uvm_va_block_gpu_state_t *gpu_state;
2244     NV_STATUS status;
2245     uvm_gpu_address_t memset_addr_base, memset_addr;
2246     uvm_push_t push;
2247     uvm_gpu_id_t id;
2248     uvm_va_block_region_t subregion;
2249     uvm_page_mask_t *zero_mask;
2250 
2251     UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk));
2252 
2253     if (chunk->is_zero)
2254         return NV_OK;
2255 
2256     gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2257     zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
2258 
2259     if (!zero_mask)
2260         return NV_ERR_NO_MEMORY;
2261 
2262     // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the
2263     // operation.
2264     //
    // We may over-zero the chunk with this approach. For example, we might be
2266     // populating a 2MB chunk because only a single page within that chunk needs
2267     // to be made resident. If we also zero non-resident pages outside of the
2268     // strict region, we could waste the effort if those pages are populated on
2269     // another processor later and migrated here.
2270     //
2271     // We zero all non-resident pages in the chunk anyway for two reasons:
2272     //
2273     // 1) Efficiency. It's better to do all zeros as pipelined transfers once
2274     //    rather than scatter them around for each populate operation.
2275     //
2276     // 2) Optimizing the common case of block_populate_gpu_chunk being called
2277     //    for already-populated chunks. If we zero once at initial populate, we
2278     //    can simply check whether the chunk is present in the array. Otherwise
2279     //    we'd have to recompute the "is any page resident" mask every time.
2280 
2281     // Roll up all pages in chunk_region which are resident somewhere
2282     uvm_page_mask_zero(zero_mask);
2283     for_each_id_in_mask(id, &block->resident)
2284         uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id));
2285 
2286     // If all pages in the chunk are resident somewhere, we don't need to clear
2287     // anything. Just make sure the chunk is tracked properly.
2288     if (uvm_page_mask_region_full(zero_mask, chunk_region)) {
2289         status = uvm_tracker_add_tracker_safe(&block->tracker, tracker);
2290         goto out;
2291     }
2292 
2293     // Complement to get the pages which are not resident anywhere. These
2294     // are the pages which must be zeroed.
2295     uvm_page_mask_complement(zero_mask, zero_mask);
2296 
2297     memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
2298     memset_addr = memset_addr_base;
2299 
2300     status = uvm_push_begin_acquire(gpu->channel_manager,
2301                                     UVM_CHANNEL_TYPE_GPU_INTERNAL,
2302                                     tracker,
2303                                     &push,
2304                                     "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)",
2305                                     chunk->address,
2306                                     chunk->address + uvm_gpu_chunk_get_size(chunk),
2307                                     uvm_va_block_region_start(block, chunk_region),
2308                                     uvm_va_block_region_end(block, chunk_region) + 1,
2309                                     block->start,
2310                                     block->end + 1);
2311     if (status != NV_OK)
2312         goto out;
2313 
2314     for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) {
2315         // Pipeline the memsets since they never overlap with each other
2316         uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2317 
2318         // We'll push one membar later for all memsets in this loop
2319         uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2320 
2321         memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE;
2322         gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion));
2323     }
2324 
2325     // A membar from this GPU is required between this memset and any PTE write
2326     // pointing this or another GPU to this chunk. Otherwise an engine could
2327     // read the PTE then access the page before the memset write is visible to
2328     // that engine.
2329     //
2330     // This memset writes GPU memory, so local mappings need only a GPU-local
2331     // membar. We can't easily determine here whether a peer GPU will ever map
2332     // this page in the future, so always use a sysmembar. uvm_push_end provides
2333     // one by default.
2334     //
2335     // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map
2336     //       this page. When peer access gets enabled, do a MEMBAR_SYS at that
2337     //       point.
2338     uvm_push_end(&push);
2339     status = uvm_tracker_add_push_safe(&block->tracker, &push);
2340 
2341 out:
2342     if (zero_mask)
2343         kmem_cache_free(g_uvm_page_mask_cache, zero_mask);
2344 
2345     return status;
2346 }
2347 
2348 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
2349                                           uvm_va_block_retry_t *retry,
2350                                           uvm_gpu_t *gpu,
2351                                           size_t chunk_index,
2352                                           uvm_va_block_region_t chunk_region)
2353 {
2354     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
2355     uvm_gpu_chunk_t *chunk = NULL;
2356     uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
2357     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2358     NV_STATUS status;
2359 
2360     if (!gpu_state)
2361         return NV_ERR_NO_MEMORY;
2362 
2363     uvm_assert_mutex_locked(&block->lock);
2364     UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu));
2365     UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes);
2366 
2367     // We zero chunks as necessary at initial population, so if the chunk is
2368     // already populated we're done. See the comment in
2369     // block_zero_new_gpu_chunk.
2370     if (gpu_state->chunks[chunk_index])
2371         return NV_OK;
2372 
2373     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region));
2374 
2375     status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk);
2376     if (status != NV_OK)
2377         return status;
2378 
2379     // In some configurations such as SR-IOV heavy, the chunk cannot be
2380     // referenced using its physical address. Create a virtual mapping.
2381     status = uvm_mmu_chunk_map(chunk);
2382     if (status != NV_OK)
2383         goto chunk_free;
2384 
2385     status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
2386     if (status != NV_OK)
2387         goto chunk_unmap;
2388 
2389     // It is safe to modify the page index field without holding any PMM locks
2390     // because the chunk is pinned, which means that none of the other fields in
2391     // the bitmap can change.
2392     chunk->va_block_page_index = chunk_region.first;
2393 
2394     // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at
2395     // compile-time that it can store VA Block page indexes.
2396     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
2397 
2398     status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
2399     if (status != NV_OK)
2400         goto chunk_unmap;
2401 
2402     if (block_test && block_test->inject_populate_error) {
2403         block_test->inject_populate_error = false;
2404 
2405         // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
2406         // causing a fatal OOM failure.
2407         status = NV_ERR_MORE_PROCESSING_REQUIRED;
2408         goto chunk_unmap_indirect_peers;
2409     }
2410 
2411     // Record the used chunk so that it can be unpinned at the end of the whole
2412     // operation.
2413     block_retry_add_used_chunk(retry, chunk);
2414     gpu_state->chunks[chunk_index] = chunk;
2415 
2416     return NV_OK;
2417 
2418 chunk_unmap_indirect_peers:
2419     block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
2420 
2421 chunk_unmap:
2422     uvm_mmu_chunk_unmap(chunk, &block->tracker);
2423 
2424 chunk_free:
2425     // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it
2426     // placed in the block tracker.
2427     uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
2428 
2429     return status;
2430 }
2431 
2432 // Populate all chunks which cover the given region and page mask.
2433 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
2434                                           uvm_va_block_retry_t *retry,
2435                                           uvm_gpu_t *gpu,
2436                                           uvm_va_block_region_t region,
2437                                           const uvm_page_mask_t *populate_mask)
2438 {
2439     uvm_va_block_region_t chunk_region, check_region;
2440     size_t chunk_index;
2441     uvm_page_index_t page_index;
2442     uvm_chunk_size_t chunk_size;
2443     NV_STATUS status;
2444 
2445     page_index = uvm_va_block_first_page_in_mask(region, populate_mask);
2446     if (page_index == region.outer)
2447         return NV_OK;
2448 
2449     chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
2450     chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
2451 
2452     while (1) {
2453         check_region = uvm_va_block_region(max(chunk_region.first, region.first),
2454                                            min(chunk_region.outer, region.outer));
2455         page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask);
2456         if (page_index != check_region.outer) {
2457             status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region);
2458             if (status != NV_OK)
2459                 return status;
2460         }
2461 
2462         if (check_region.outer == region.outer)
2463             break;
2464 
2465         ++chunk_index;
2466         chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer);
2467         chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE));
2468     }
2469 
2470     return NV_OK;
2471 }
2472 
2473 static NV_STATUS block_populate_pages(uvm_va_block_t *block,
2474                                       uvm_va_block_retry_t *retry,
2475                                       uvm_va_block_context_t *block_context,
2476                                       uvm_processor_id_t dest_id,
2477                                       uvm_va_block_region_t region,
2478                                       const uvm_page_mask_t *page_mask)
2479 {
2480     NV_STATUS status;
2481     const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id);
2482     uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
2483     uvm_memcg_context_t memcg_context;
2484 
2485     if (!resident_mask)
2486         return NV_ERR_NO_MEMORY;
2487 
2488     if (page_mask)
2489         uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask);
2490     else
2491         uvm_page_mask_complement(populate_page_mask, resident_mask);
2492 
2493     if (UVM_ID_IS_GPU(dest_id))
2494         return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
2495 
2496     uvm_memcg_context_start(&memcg_context, block_context->mm);
2497     status = block_populate_pages_cpu(block, populate_page_mask, region, block_context);
2498     uvm_memcg_context_end(&memcg_context);
2499     return status;
2500 }
2501 
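// Get the precomputed VA space mask of processors which 'from' can copy from.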
2502 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
2503 {
2504     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2505 
2506     return &va_space->can_copy_from[uvm_id_value(from)];
2507 }
2508 
2509 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
2510 {
2511     return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
2512 }
2513 
2514 // Get the chunk containing the given page, along with the offset of that page
2515 // within the chunk.
2516 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
2517 {
2518     uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
2519     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
2520     size_t chunk_index;
2521     uvm_gpu_chunk_t *chunk;
2522     uvm_chunk_size_t chunk_size;
2523 
2524     UVM_ASSERT(gpu_state);
2525 
2526     chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
2527     chunk = gpu_state->chunks[chunk_index];
2528     UVM_ASSERT(chunk);
2529 
2530     if (chunk_offset) {
2531         size_t page_offset = block_page.page_index -
                             uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
2533         *chunk_offset = page_offset * PAGE_SIZE;
2534     }
2535 
2536     return chunk;
2537 }
2538 
// Get the physical GPU address of a block's page from the POV of the specified
// GPU. This is the address that should be used for making PTEs for the
// specified GPU.
2541 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
2542                                                       block_phys_page_t block_page,
2543                                                       uvm_gpu_t *gpu)
2544 {
2545     uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2546     size_t chunk_offset;
2547     uvm_gpu_chunk_t *chunk;
2548 
2549     UVM_ASSERT(accessing_gpu_state);
2550 
2551     if (UVM_ID_IS_CPU(block_page.processor)) {
2552         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
2553         NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
2554         uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
2555                                                                        uvm_cpu_chunk_get_size(chunk),
2556                                                                        block_page.page_index);
2557 
2558         // The page should be mapped for physical access already as we do that
2559         // eagerly on CPU page population and GPU state alloc.
2560         UVM_ASSERT(dma_addr != 0);
2561         dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
2562 
2563         return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
2564     }
2565 
2566     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2567 
2568     if (uvm_id_equal(block_page.processor, gpu->id)) {
2569         return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
2570     }
2571     else {
2572         uvm_gpu_phys_address_t phys_addr;
2573         uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
2574         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2575 
2576         UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2577         phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
2578         phys_addr.address += chunk_offset;
2579         return phys_addr;
2580     }
2581 }
2582 
2583 // Get the physical GPU address of a block's page from the POV of the specified
2584 // GPU, suitable for accessing the memory from UVM-internal CE channels.
2585 //
2586 // Notably this may be different from block_phys_page_address() in order to
2587 // handle CE limitations in addressing physical memory directly.
2588 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block,
2589                                                       block_phys_page_t block_page,
2590                                                       uvm_gpu_t *gpu)
2591 {
2592     uvm_gpu_t *owning_gpu;
2593     size_t chunk_offset;
2594     uvm_gpu_chunk_t *chunk;
2595     uvm_gpu_address_t copy_addr;
2596     uvm_va_space_t *va_space;
2597 
2598     UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor),
2599                    "from %s to %s\n",
2600                    block_processor_name(block, gpu->id),
2601                    block_processor_name(block, block_page.processor));
2602 
2603     // CPU and local GPU accesses can rely on block_phys_page_address, but the
2604     // resulting physical address may need to be converted into virtual.
2605     if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id))
2606         return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu));
2607 
2608     va_space = uvm_va_block_get_va_space(block);
2609 
2610     // See the comments on the peer_identity_mappings_supported assignments in
2611     // the HAL for why we disable direct copies between peers.
2612     owning_gpu = block_get_gpu(block, block_page.processor);
2613 
2614     UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2615 
2616     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2617     copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu);
2618     copy_addr.address += chunk_offset;
2619     return copy_addr;
2620 }
2621 
2622 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
2623                                                           uvm_page_index_t page_index,
2624                                                           uvm_processor_id_t residency,
2625                                                           uvm_gpu_t *gpu)
2626 {
2627     uvm_assert_mutex_locked(&va_block->lock);
2628 
2629     return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu);
2630 }
2631 
2632 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
2633                                                           uvm_page_index_t page_index,
2634                                                           uvm_gpu_t *gpu)
2635 {
2636     return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu);
2637 }
2638 
2639 typedef struct
2640 {
2641     // Location of the memory
2642     uvm_processor_id_t id;
2643 
2644     // Whether the whole block has a single physically-contiguous chunk of
2645     // storage on the processor.
2646     bool is_block_contig;
2647 
2648     // Starting address of the physically-contiguous allocation, from the view
2649     // of the copying GPU. Valid only if is_block_contig.
2650     uvm_gpu_address_t gpu_address;
2651 } block_copy_addr_t;
2652 
2653 typedef struct
2654 {
2655     block_copy_addr_t src;
2656     block_copy_addr_t dst;
2657     uvm_conf_computing_dma_buffer_t *dma_buffer;
2658 } block_copy_state_t;
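
// Illustrative sketch (mirroring how block_copy_resident_pages_between() below
// sets things up) of a typical block_copy_state_t initialization:
//
//     block_copy_state_t copy_state = {0};
//
//     copy_state.src.id = src_id;
//     copy_state.dst.id = dst_id;
//     copy_state.src.is_block_contig = is_block_phys_contig(block, src_id);
//     copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id);
//
// The gpu_address fields are filled in later, once the copying GPU is known and
// only if the corresponding storage is physically contiguous for the whole
// block (see block_copy_get_address()).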
2659 
2660 // Begin a push appropriate for copying data from the src_id processor to the
2661 // dst_id processor. At least one of src_id and dst_id must be a GPU.
2662 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block,
2663                                        block_copy_state_t *copy_state,
2664                                        uvm_tracker_t *tracker,
2665                                        uvm_push_t *push)
2666 {
2667     uvm_gpu_t *gpu;
2668     NV_STATUS status;
2669     uvm_channel_type_t channel_type;
2670     uvm_tracker_t *tracker_ptr = tracker;
2671     uvm_processor_id_t dst_id = copy_state->dst.id;
2672     uvm_processor_id_t src_id = copy_state->src.id;
2673     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
2674 
2675     UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
2676                    "Unexpected copy to self, processor %s\n",
2677                    block_processor_name(va_block, src_id));
2678 
2679     if (UVM_ID_IS_CPU(src_id)) {
2680         gpu = block_get_gpu(va_block, dst_id);
2681         channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
2682     }
2683     else if (UVM_ID_IS_CPU(dst_id)) {
2684         gpu = block_get_gpu(va_block, src_id);
2685         channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
2686     }
2687     else {
2688         // For GPU to GPU copies, prefer to "push" the data from the source as
2689         // that works better at least for P2P over PCI-E.
2690         gpu = block_get_gpu(va_block, src_id);
2691 
2692         channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
2693     }
2694 
2695     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id),
2696                    "GPU %s dst %s src %s\n",
2697                    block_processor_name(va_block, gpu->id),
2698                    block_processor_name(va_block, dst_id),
2699                    block_processor_name(va_block, src_id));
2700     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id),
2701                    "GPU %s dst %s src %s\n",
2702                    block_processor_name(va_block, gpu->id),
2703                    block_processor_name(va_block, dst_id),
2704                    block_processor_name(va_block, src_id));
2705 
2706     if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
2707         uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id);
2708         return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager,
2709                                                  dst_gpu,
2710                                                  tracker,
2711                                                  push,
2712                                                  "Copy from %s to %s for block [0x%llx, 0x%llx]",
2713                                                  block_processor_name(va_block, src_id),
2714                                                  block_processor_name(va_block, dst_id),
2715                                                  va_block->start,
2716                                                  va_block->end);
2717     }
2718 
2719     if (uvm_conf_computing_mode_enabled(gpu)) {
2720         // When the Confidential Computing feature is enabled, additional
2721         // dependencies apply to the input tracker as well as the dma_buffer
2722         // tracker.
2723         // * In the CPU to GPU case, UVM performs the CPU-side crypto-operations
2724         //   before the GPU copy, so both the dma_buffer tracker and the input
2725         //   tracker need to be completed first.
2726         // * In the GPU to CPU case, the GPU copy happens first, but the same
2727         //   principle applies: UVM acquires both the input tracker and the dma buffer.
2728         status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
2729         if (status != NV_OK)
2730             goto error;
2731 
2732         UVM_ASSERT(copy_state->dma_buffer == NULL);
2733         status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
2734                                                      &copy_state->dma_buffer,
2735                                                      &local_tracker);
2736 
2737         if (status != NV_OK)
2738             goto error;
2739 
2740         if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
2741             status = uvm_tracker_wait(&local_tracker);
2742             if (status != NV_OK)
2743                 goto error;
2744         }
2745 
2746         tracker_ptr = &local_tracker;
2747     }
2748 
2749     status = uvm_push_begin_acquire(gpu->channel_manager,
2750                                     channel_type,
2751                                     tracker_ptr,
2752                                     push,
2753                                     "Copy from %s to %s for block [0x%llx, 0x%llx]",
2754                                     block_processor_name(va_block, src_id),
2755                                     block_processor_name(va_block, dst_id),
2756                                     va_block->start,
2757                                     va_block->end);
2758 
2759 error:
2760     // Caller is responsible for freeing the DMA buffer on error
2761     uvm_tracker_deinit(&local_tracker);
2762     return status;
2763 }
2764 
2765 // A page is clean iff...
2766 // the destination is the preferred location and
2767 // the source is the CPU and
2768 // the destination does not support faults/eviction and
2769 // the CPU page is not dirty
2770 static bool block_page_is_clean(uvm_va_block_t *block,
2771                                 uvm_processor_id_t dst_id,
2772                                 uvm_processor_id_t src_id,
2773                                 uvm_page_index_t page_index)
2774 {
2775     return !uvm_va_block_is_hmm(block) &&
2776            uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) &&
2777            UVM_ID_IS_CPU(src_id) &&
2778            !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
2779            !block_cpu_page_is_dirty(block, page_index);
2780 }
2781 
2782 // When the destination is the CPU...
2783 // if the source is the preferred location, mark as clean
2784 // otherwise, mark as dirty
2785 static void block_update_page_dirty_state(uvm_va_block_t *block,
2786                                           uvm_processor_id_t dst_id,
2787                                           uvm_processor_id_t src_id,
2788                                           uvm_page_index_t page_index)
2789 {
2790     if (UVM_ID_IS_GPU(dst_id))
2791         return;
2792 
2793     if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location))
2794         block_mark_cpu_page_clean(block, page_index);
2795     else
2796         block_mark_cpu_page_dirty(block, page_index);
2797 }
2798 
2799 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
2800 {
2801     uvm_gpu_t *gpu;
2802 
2803     if (UVM_ID_IS_CPU(id))
2804         return;
2805 
2806     gpu = block_get_gpu(block, id);
2807 
2808     // If the block is of the max size and the GPU supports eviction, mark the
2809     // root chunk as used in PMM.
2810     // HMM always allocates PAGE_SIZE GPU chunks, so skip HMM va_blocks.
2811     if (!uvm_va_block_is_hmm(block) &&
2812         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2813         uvm_gpu_supports_eviction(gpu)) {
2814         // The chunk has to be there if this GPU is resident
2815         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id));
2816         uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]);
2817     }
2818 }
2819 
2820 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2821 {
2822     UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2823 
2824     if (uvm_processor_mask_test_and_set(&block->resident, id))
2825         return;
2826 
2827     block_mark_memory_used(block, id);
2828 }
2829 
2830 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2831 {
2832     uvm_gpu_t *gpu;
2833 
2834     UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2835 
2836     if (!uvm_processor_mask_test_and_clear(&block->resident, id))
2837         return;
2838 
2839     if (UVM_ID_IS_CPU(id))
2840         return;
2841 
2842     gpu = block_get_gpu(block, id);
2843 
2844     // If the block is of the max size and the GPU supports eviction, mark the
2845     // root chunk as unused in PMM.
2846     if (!uvm_va_block_is_hmm(block) &&
2847         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2848         uvm_gpu_supports_eviction(gpu)) {
2849         // The chunk may not be there any more when residency is cleared.
2850         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2851         if (gpu_state && gpu_state->chunks[0])
2852             uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]);
2853     }
2854 }
2855 
2856 static bool block_phys_copy_contig_check(uvm_va_block_t *block,
2857                                          uvm_page_index_t page_index,
2858                                          const uvm_gpu_address_t *base_address,
2859                                          uvm_processor_id_t proc_id,
2860                                          uvm_gpu_t *copying_gpu)
2861 {
2862     uvm_gpu_address_t page_address;
2863     uvm_gpu_address_t contig_address = *base_address;
2864 
2865     contig_address.address += page_index * PAGE_SIZE;
2866 
2867     page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu);
2868 
2869     return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
2870 }
2871 
2872 // Check if the VA block has a single physically-contiguous chunk of storage
2873 // on the processor.
2874 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id)
2875 {
2876     uvm_cpu_chunk_t *chunk;
2877 
2878     if (UVM_ID_IS_GPU(id))
2879         return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
2880 
2881     chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL);
2882     return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
2883 }
2884 
2885 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
2886                                                       uvm_page_index_t page_index,
2887                                                       uvm_processor_id_t resident_id)
2888 {
2889     if (UVM_ID_IS_CPU(resident_id)) {
2890         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
2891         return uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
2892     }
2893     else {
2894         uvm_chunk_size_t chunk_size;
2895         (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size);
2896         return uvm_va_block_chunk_region(block, chunk_size, page_index);
2897     }
2898 }
2899 
2900 // Like block_phys_page_copy_address, but uses the address cached in bca when
2901 // possible.
2902 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
2903                                                 block_copy_addr_t *bca,
2904                                                 uvm_page_index_t page_index,
2905                                                 uvm_gpu_t *copying_gpu)
2906 {
2907     if (bca->is_block_contig) {
2908         uvm_gpu_address_t addr = bca->gpu_address;
2909         addr.address += page_index * PAGE_SIZE;
2910         UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu));
2911         return addr;
2912     }
2913 
2914     return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu);
2915 }
2916 
2917 // When the Confidential Computing feature is enabled, the function performs
2918 // CPU-side page encryption and GPU-side decryption into the CPR.
2919 // GPU operations respect the caller's membar previously set in the push.
2920 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
2921                                                       block_copy_state_t *copy_state,
2922                                                       uvm_va_block_region_t region,
2923                                                       uvm_push_t *push)
2924 {
2925     uvm_push_flag_t membar_flag = 0;
2926     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2927     uvm_page_index_t page_index = region.first;
2928     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
2929     struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
2930     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
2931     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
2932     char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
2933                                         (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
2934     uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
2935     char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
2936 
2937     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
2938     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
2939 
2940     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
2941 
2942     // See comment in block_copy_begin_push.
2943     UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
2944 
2945     staging_buffer.address += page_index * PAGE_SIZE;
2946     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2947 
2948     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
2949         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
2950     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
2951         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
2952 
2953     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
2954     // decryption must happen on a PAGE_SIZE basis.
2955     for_each_va_block_page_in_region(page_index, region) {
2956         void *src_cpu_virt_addr;
2957 
2958         // The caller guarantees that all pages in region are contiguous,
2959         // meaning they're guaranteed to be part of the same compound page.
2960         UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index));
2961 
2962         src_cpu_virt_addr = kmap(src_page);
2963         uvm_conf_computing_cpu_encrypt(push->channel,
2964                                        cpu_va_staging_buffer,
2965                                        src_cpu_virt_addr,
2966                                        NULL,
2967                                        PAGE_SIZE,
2968                                        cpu_auth_tag_buffer);
2969         kunmap(src_page);
2970 
2971         // The first LCE operation should be non-pipelined to guarantee ordering,
2972         // as we do not know when the last non-pipelined copy was issued. The last
2973         // one applies the membar originally planned for the push, if any.
2974         // TODO: Bug 3857691: Inherit policy instead of forcing the first
2975         // invocation to be non-pipelined.
2976         if (page_index > region.first)
2977             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2978 
2979         if (page_index < (region.outer - 1))
2980             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2981         else if (membar_flag)
2982             uvm_push_set_flag(push, membar_flag);
2983 
2984         gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
2985 
2986         src_page++;
2987         dst_address.address += PAGE_SIZE;
2988         cpu_va_staging_buffer += PAGE_SIZE;
2989         staging_buffer.address += PAGE_SIZE;
2990         cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2991         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2992     }
2993 }
2994 
2995 // When the Confidential Computing feature is enabled, the function performs
2996 // GPU-side page encryption. GPU operations respect the caller's membar
2997 // previously set in the push.
2998 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
2999                                                       block_copy_state_t *copy_state,
3000                                                       uvm_va_block_region_t region,
3001                                                       uvm_push_t *push)
3002 {
3003     uvm_push_flag_t membar_flag = 0;
3004     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3005     uvm_page_index_t page_index = region.first;
3006     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3007     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3008     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3009     uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
3010 
3011     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3012     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
3013 
3014     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
3015 
3016     staging_buffer.address += page_index * PAGE_SIZE;
3017     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3018 
3019     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3020         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3021     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3022         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3023 
3024     // Because we use kmap() to map pages for CPU-side crypto-operations,
3025     // and kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3026     // decryption must happen on a PAGE_SIZE basis.
3027     for_each_va_block_page_in_region(page_index, region) {
3028         uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
3029 
3030         // The first LCE operation should be non-pipelined to guarantee ordering,
3031         // as we do not know when the last non-pipelined copy was issued. The last
3032         // one applies the membar originally planned for the push, if any.
3033         // TODO: Bug 3857691: Inherit policy instead of forcing the first
3034         // invocation to be non-pipelined.
3035         if (page_index > region.first)
3036             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3037 
3038         if (page_index < (region.outer - 1))
3039             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3040         else if (membar_flag)
3041             uvm_push_set_flag(push, membar_flag);
3042 
3043         gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3044 
3045         src_address.address += PAGE_SIZE;
3046         staging_buffer.address += PAGE_SIZE;
3047         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3048     }
3049 
3050     uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3051 }
3052 
3053 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3054                                                   block_copy_state_t *copy_state,
3055                                                   uvm_push_t *push)
3056 {
3057     NV_STATUS status;
3058     uvm_page_index_t page_index;
3059     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3060     uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3061     void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3062     void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3063 
3064     UVM_ASSERT(uvm_channel_is_secure(push->channel));
3065 
3066     if (UVM_ID_IS_GPU(copy_state->dst.id))
3067         return NV_OK;
3068 
3069     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3070 
3071     status = uvm_push_wait(push);
3072     if (status != NV_OK)
3073         return status;
3074 
3075     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
3076     // decryption must happen on a PAGE_SIZE basis.
3077     for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3078         struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
3079         void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3080         void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3081         void *cpu_page_address = kmap(dst_page);
3082 
3083         status = uvm_conf_computing_cpu_decrypt(push->channel,
3084                                                 cpu_page_address,
3085                                                 staging_buffer,
3086                                                 &dma_buffer->decrypt_iv[page_index],
3087                                                 PAGE_SIZE,
3088                                                 auth_tag_buffer);
3089         kunmap(dst_page);
3090         if (status != NV_OK) {
3091             // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3092             //                    failures & other failures gracefully.
3093             // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3094             // tag verification fails. Should this happen, it is considered a
3095             // critical failure that cannot be recovered from.
3096             uvm_global_set_fatal_error(status);
3097             return status;
3098         }
3099     }
3100 
3101     return NV_OK;
3102 }
3103 
3104 static void block_copy_push(uvm_va_block_t *block,
3105                             block_copy_state_t *copy_state,
3106                             uvm_va_block_region_t region,
3107                             uvm_push_t *push)
3108 {
3109     uvm_gpu_address_t gpu_dst_address;
3110     uvm_gpu_address_t gpu_src_address;
3111     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3112 
3113     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3114 
3115     if (uvm_channel_is_secure(push->channel)) {
3116         if (UVM_ID_IS_CPU(copy_state->src.id))
3117             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3118         else
3119             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3120 
3121         return;
3122     }
3123 
3124     gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3125     gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3126     gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3127 }
3128 
3129 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3130                                      block_copy_state_t *copy_state,
3131                                      uvm_tracker_t *copy_tracker,
3132                                      NV_STATUS push_status,
3133                                      uvm_push_t *push)
3134 {
3135     NV_STATUS tracker_status;
3136 
3137     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3138     //       by that GPU, use a GPU-local membar if no peer can currently
3139     //       map this page. When peer access gets enabled, do a MEMBAR_SYS
3140     //       at that point.
3141     uvm_push_end(push);
3142 
3143     if ((push_status == NV_OK) && uvm_channel_is_secure(push->channel))
3144         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3145 
3146     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3147     if (push_status == NV_OK)
3148         push_status = tracker_status;
3149 
3150     if (uvm_channel_is_secure(push->channel)) {
3151         uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3152         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3153 
3154         uvm_tracker_overwrite_with_push(&local_tracker, push);
3155         uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool,
3156                                            copy_state->dma_buffer,
3157                                            &local_tracker);
3158         copy_state->dma_buffer = NULL;
3159         uvm_tracker_deinit(&local_tracker);
3160     }
3161 
3162     return push_status;
3163 }
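
// The copy push lifecycle used below is, roughly (illustrative sketch, see
// block_copy_resident_pages_between() for the real sequence):
//
//     status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
//     block_copy_push(block, &copy_state, region, &push);  // one or more regions
//     status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);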
3164 
3165 // Copies pages resident on the src_id processor to the dst_id processor
3166 //
3167 // The function adds the pages that were successfully copied to the output
3168 // migrated_pages mask and returns the number of pages in copied_pages. These
3169 // fields are reliable even if an error is returned.
3170 //
3171 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
3172 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
3173                                                    uvm_va_block_context_t *block_context,
3174                                                    uvm_processor_id_t dst_id,
3175                                                    uvm_processor_id_t src_id,
3176                                                    uvm_va_block_region_t region,
3177                                                    uvm_page_mask_t *copy_mask,
3178                                                    const uvm_page_mask_t *prefetch_page_mask,
3179                                                    uvm_va_block_transfer_mode_t transfer_mode,
3180                                                    uvm_page_mask_t *migrated_pages,
3181                                                    NvU32 *copied_pages,
3182                                                    uvm_tracker_t *copy_tracker)
3183 {
3184     NV_STATUS status = NV_OK;
3185     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3186     uvm_gpu_t *copying_gpu = NULL;
3187     uvm_push_t push;
3188     uvm_page_index_t page_index;
3189     uvm_page_index_t contig_start_index = region.outer;
3190     uvm_page_index_t last_index = region.outer;
3191     uvm_range_group_range_t *rgr = NULL;
3192     bool rgr_has_changed = false;
3193     uvm_make_resident_cause_t cause = block_context->make_resident.cause;
3194     uvm_make_resident_cause_t contig_cause = cause;
3195     const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3196                                cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3197                                cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask;
3198     block_copy_state_t copy_state = {0};
3199     uvm_va_range_t *va_range = block->va_range;
3200     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3201 
3202     copy_state.src.id = src_id;
3203     copy_state.dst.id = dst_id;
3204     copy_state.src.is_block_contig = is_block_phys_contig(block, src_id);
3205     copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id);
3206 
3207     *copied_pages = 0;
3208 
3209     // If there are no pages to be copied, exit early
3210     if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) ||
3211         !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
3212         return NV_OK;
3213 
3214     // uvm_range_group_range_iter_first should only be called when the va_space
3215     // lock is held, which is always the case unless an eviction is taking
3216     // place.
3217     if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3218         rgr = uvm_range_group_range_iter_first(va_space,
3219                                                uvm_va_block_region_start(block, region),
3220                                                uvm_va_block_region_end(block, region));
3221         rgr_has_changed = true;
3222     }
3223 
3224     if (UVM_ID_IS_CPU(dst_id)) {
3225         uvm_memcg_context_t memcg_context;
3226 
3227         // To support staging through CPU, populate CPU pages on demand.
3228         // GPU destinations should have their pages populated already, but
3229         // that might change if we add staging through GPUs.
3230         uvm_memcg_context_start(&memcg_context, block_context->mm);
3231         status = block_populate_pages_cpu(block, copy_mask, region, block_context);
3232         uvm_memcg_context_end(&memcg_context);
3233         if (status != NV_OK)
3234             return status;
3235     }
3236 
3237     // TODO: Bug 3745051: This function is complicated and needs refactoring
3238     for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
3239         NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
3240         uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ?
3241                                                 UVM_MAKE_RESIDENT_CAUSE_PREFETCH :
3242                                                 cause;
3243 
3244         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3245         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3246 
3247         // If we're not evicting and we're migrating away from the preferred
3248         // location, then we should add the range group range to the list of
3249         // migrated ranges in the range group. It's safe to skip this during
3250         // eviction because the range_group's migrated_ranges list is only a
3251         // UVM-Lite optimization, and eviction is not supported on UVM-Lite GPUs.
3252         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3253             uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3254             // rgr_has_changed is used to minimize the number of times the
3255             // migrated_ranges_lock is taken. It is set to false when the range
3256             // group range pointed to by rgr is added to the migrated_ranges list,
3257             // and it is set back to true when we move to a different range group
3258             // range.
3259 
3260             // The current page could be after the end of rgr. Iterate over the
3261             // range group ranges until rgr's end location is greater than or
3262             // equal to the current page.
3263             while (rgr && rgr->node.end < page_start) {
3264                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3265                 rgr_has_changed = true;
3266             }
3267 
3268             // Check whether the current page lies within rgr. A single page
3269             // must entirely reside within a range group range. Since we've
3270             // advanced rgr until its end is at or above page_start, we now
3271             // check if page_start lies within rgr.
3272             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3273                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3274                 if (list_empty(&rgr->range_group_migrated_list_node))
3275                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3276                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3277 
3278                 rgr_has_changed = false;
3279             }
3280         }
3281 
3282         // No need to copy pages that haven't changed. Just clear residency
3283         // information.
3284         if (block_page_is_clean(block, dst_id, src_id, page_index))
3285             continue;
3286 
3287         if (!copying_gpu) {
3288             status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3289 
3290             if (status != NV_OK)
3291                 break;
3292             copying_gpu = uvm_push_get_gpu(&push);
3293 
3294             // Record all processors involved in the copy
3295             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3296             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3297             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3298 
3299             // This function is called just once per VA block and needs to
3300             // receive the "main" cause for the migration (it mainly checks if
3301             // we are in the eviction path). Therefore, we pass cause instead
3302             // of contig_cause.
3303             uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3304         }
3305         else {
3306             uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3307         }
3308 
3309         if (!uvm_va_block_is_hmm(block))
3310             block_update_page_dirty_state(block, dst_id, src_id, page_index);
3311 
3312         if (last_index == region.outer) {
3313             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3314             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3315             contig_start_index = page_index;
3316             contig_cause = page_cause;
3317 
3318             // When CC is enabled, transfers between GPU and CPU don't rely on
3319             // any GPU mapping of CPU chunks, physical or virtual.
3320             if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3321                 can_cache_src_phys_addr = false;
3322             if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3323                 can_cache_dst_phys_addr = false;
3324             // Computing the physical address is a non-trivial operation and
3325             // seems to be a performance limiter on systems with 2 or more
3326             // NVLINK links. Therefore, for physically-contiguous block
3327             // storage, we cache the start address and compute the page address
3328             // using the page index.
3329             if (can_cache_src_phys_addr) {
3330                 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3331                                                                           block_phys_page(src_id, 0),
3332                                                                           copying_gpu);
3333             }
3334             if (can_cache_dst_phys_addr) {
3335                 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3336                                                                           block_phys_page(dst_id, 0),
3337                                                                           copying_gpu);
3338             }
3339         }
3340         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3341             uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3342             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3343 
3344             // If both src and dst are physically-contiguous, consolidate copies
3345             // of contiguous pages into a single method.
3346             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3347                 block_copy_push(block, &copy_state, contig_region, &push);
3348 
3349             uvm_perf_event_notify_migration(&va_space->perf_events,
3350                                             &push,
3351                                             block,
3352                                             dst_id,
3353                                             src_id,
3354                                             uvm_va_block_region_start(block, contig_region),
3355                                             uvm_va_block_region_size(contig_region),
3356                                             transfer_mode,
3357                                             contig_cause,
3358                                             &block_context->make_resident);
3359 
3360             contig_start_index = page_index;
3361             contig_cause = page_cause;
3362         }
3363 
3364         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3365             block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3366 
3367         last_index = page_index;
3368     }
3369 
3370     // Copy the remaining pages
3371     if (copying_gpu) {
3372         uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3373         UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3374 
3375         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3376             block_copy_push(block, &copy_state, contig_region, &push);
3377 
3378         uvm_perf_event_notify_migration(&va_space->perf_events,
3379                                         &push,
3380                                         block,
3381                                         dst_id,
3382                                         src_id,
3383                                         uvm_va_block_region_start(block, contig_region),
3384                                         uvm_va_block_region_size(contig_region),
3385                                         transfer_mode,
3386                                         contig_cause,
3387                                         &block_context->make_resident);
3388 
3389         status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3390     }
3391 
3392     // Update VA block status bits
3393     //
3394     // Only update the bits for the pages that succeeded
3395     if (status != NV_OK)
3396         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3397 
3398     *copied_pages = uvm_page_mask_weight(copy_mask);
3399     if (*copied_pages)
3400         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3401 
3402     return status;
3403 }
3404 
3405 // Copy resident pages to the destination from all source processors in the
3406 // src_processor_mask
3407 //
3408 // The function adds the pages that were successfully copied to the output
3409 // migrated_pages mask and returns the number of pages in copied_pages. These
3410 // fields are reliable even if an error is returned.
3411 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
3412                                                 uvm_va_block_context_t *block_context,
3413                                                 uvm_processor_id_t dst_id,
3414                                                 const uvm_processor_mask_t *src_processor_mask,
3415                                                 uvm_va_block_region_t region,
3416                                                 const uvm_page_mask_t *page_mask,
3417                                                 const uvm_page_mask_t *prefetch_page_mask,
3418                                                 uvm_va_block_transfer_mode_t transfer_mode,
3419                                                 NvU32 max_pages_to_copy,
3420                                                 uvm_page_mask_t *migrated_pages,
3421                                                 NvU32 *copied_pages_out,
3422                                                 uvm_tracker_t *tracker_out)
3423 {
3424     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3425     uvm_processor_id_t src_id;
3426     uvm_processor_mask_t search_mask;
3427     uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
3428 
3429     uvm_processor_mask_copy(&search_mask, src_processor_mask);
3430 
3431     *copied_pages_out = 0;
3432 
3433     for_each_closest_id(src_id, &search_mask, dst_id, va_space) {
3434         uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id);
3435         NV_STATUS status;
3436         NvU32 copied_pages_from_src;
3437 
3438         UVM_ASSERT(!uvm_id_equal(src_id, dst_id));
3439 
3440         uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
3441 
3442         if (page_mask)
3443             uvm_page_mask_and(copy_mask, copy_mask, page_mask);
3444 
3445         status = block_copy_resident_pages_between(block,
3446                                                    block_context,
3447                                                    dst_id,
3448                                                    src_id,
3449                                                    region,
3450                                                    copy_mask,
3451                                                    prefetch_page_mask,
3452                                                    transfer_mode,
3453                                                    migrated_pages,
3454                                                    &copied_pages_from_src,
3455                                                    tracker_out);
3456         *copied_pages_out += copied_pages_from_src;
3457         UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
3458 
3459         if (status != NV_OK)
3460             return status;
3461 
3462         // Break out once we have already copied the maximum number of pages
3463         if (*copied_pages_out == max_pages_to_copy)
3464             break;
3465     }
3466 
3467     return NV_OK;
3468 }
3469 
3470 static void break_read_duplication_in_region(uvm_va_block_t *block,
3471                                              uvm_va_block_context_t *block_context,
3472                                              uvm_processor_id_t dst_id,
3473                                              uvm_va_block_region_t region,
3474                                              const uvm_page_mask_t *page_mask)
3475 {
3476     uvm_processor_id_t id;
3477     uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask;
3478 
3479     uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
3480 
3481     UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id)));
3482 
3483     // Clear read_duplicated bit for all pages in region
3484     uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
3485 
3486     // Clear residency bits for all processors other than dst_id
3487     for_each_id_in_mask(id, &block->resident) {
3488         uvm_page_mask_t *other_resident_mask;
3489 
3490         if (uvm_id_equal(id, dst_id))
3491             continue;
3492 
3493         other_resident_mask = uvm_va_block_resident_mask_get(block, id);
3494 
3495         if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region))
3496             block_clear_resident_processor(block, id);
3497     }
3498 }
3499 
3500 static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
3501                                                  uvm_va_block_context_t *block_context,
3502                                                  uvm_processor_id_t dst_id,
3503                                                  uvm_va_block_region_t region,
3504                                                  const uvm_page_mask_t *page_mask)
3505 {
3506     uvm_page_index_t page_index;
3507     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3508     uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
3509 
3510     if (page_mask)
3511         uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask);
3512     else
3513         uvm_page_mask_complement(first_touch_mask, resident_mask);
3514 
3515     uvm_page_mask_region_clear_outside(first_touch_mask, region);
3516 
3517     for_each_va_block_page_in_mask(page_index, first_touch_mask, block) {
3518         UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index));
3519         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3520         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3521     }
3522 
3523     uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
3524     if (!uvm_page_mask_empty(resident_mask))
3525         block_set_resident_processor(block, dst_id);
3526 
3527     // Add them to the output mask, too
3528     uvm_page_mask_or(&block_context->make_resident.pages_changed_residency,
3529                      &block_context->make_resident.pages_changed_residency,
3530                      first_touch_mask);
3531 }
3532 
3533 // Copy resident pages from other processors to the destination.
3534 // All the pages on the destination need to be populated by the caller first.
3535 // Pages not resident anywhere else need to be zeroed out as well.
3536 // The transfer_mode is only used to tell uvm_perf_event_notify_migration()
3537 // whether the copy is for a migration or read duplication.
3538 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
3539                                            uvm_va_block_context_t *block_context,
3540                                            uvm_processor_id_t dst_id,
3541                                            uvm_va_block_region_t region,
3542                                            const uvm_page_mask_t *page_mask,
3543                                            const uvm_page_mask_t *prefetch_page_mask,
3544                                            uvm_va_block_transfer_mode_t transfer_mode)
3545 {
3546     NV_STATUS status = NV_OK;
3547     NV_STATUS tracker_status;
3548     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3549     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3550     NvU32 missing_pages_count;
3551     NvU32 pages_copied;
3552     NvU32 pages_copied_to_cpu;
3553     uvm_processor_mask_t src_processor_mask;
3554     uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
3555     uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
3556     uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged;
3557 
3558     uvm_page_mask_zero(migrated_pages);
3559     uvm_page_mask_zero(staged_pages);
3560 
3561     if (page_mask)
3562         uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
3563     else
3564         uvm_page_mask_complement(copy_page_mask, resident_mask);
3565 
3566     missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region);
3567 
3568     if (missing_pages_count == 0)
3569         goto out;
3570 
3571     // TODO: Bug 1753731: Add P2P2P copies staged through a GPU
3572     // TODO: Bug 1753731: When a page is resident in multiple locations due to
3573     //       read-duplication, spread out the source of the copy so we don't
3574     //       bottleneck on a single location.
3575 
3576     uvm_processor_mask_zero(&src_processor_mask);
3577 
3578     if (!uvm_id_equal(dst_id, UVM_ID_CPU)) {
3579         // If the destination is a GPU, first copy everything from the processors
3580         // it can copy from directly. Notably this will copy pages from the CPU as
3581         // well, even if some extra copies from the CPU are required later for
3582         // staged copies.
3583         uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
3584         uvm_processor_mask_clear(&src_processor_mask, dst_id);
3585 
3586         status = block_copy_resident_pages_mask(block,
3587                                                 block_context,
3588                                                 dst_id,
3589                                                 &src_processor_mask,
3590                                                 region,
3591                                                 copy_page_mask,
3592                                                 prefetch_page_mask,
3593                                                 transfer_mode,
3594                                                 missing_pages_count,
3595                                                 migrated_pages,
3596                                                 &pages_copied,
3597                                                 &local_tracker);
3598 
3599         UVM_ASSERT(missing_pages_count >= pages_copied);
3600         missing_pages_count -= pages_copied;
3601 
3602         if (status != NV_OK)
3603             goto out;
3604 
3605         if (missing_pages_count == 0)
3606             goto out;
3607 
3608         if (pages_copied)
3609             uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
3610     }
3611 
3612     // Now copy from everywhere else to the CPU. This is both for when the
3613     // destination is the CPU (src_processor_mask empty) and for a staged copy
3614     // (src_processor_mask containing processors with copy access to dst_id).
3615     uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask);
3616     uvm_processor_mask_clear(&src_processor_mask, dst_id);
3617     uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU);
3618 
3619     status = block_copy_resident_pages_mask(block,
3620                                             block_context,
3621                                             UVM_ID_CPU,
3622                                             &src_processor_mask,
3623                                             region,
3624                                             copy_page_mask,
3625                                             prefetch_page_mask,
3626                                             transfer_mode,
3627                                             missing_pages_count,
3628                                             staged_pages,
3629                                             &pages_copied_to_cpu,
3630                                             &local_tracker);
3631     if (status != NV_OK)
3632         goto out;
3633 
3634     // If the destination is the CPU, then we copied everything there above
3635     if (UVM_ID_IS_CPU(dst_id)) {
3636         uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages);
3637         missing_pages_count -= pages_copied_to_cpu;
3638 
3639         goto out;
3640     }
3641 
3642     // Add everything to the block's tracker so that the
3643     // block_copy_resident_pages_between() call below will acquire it.
3644     status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3645     if (status != NV_OK)
3646         goto out;
3647     uvm_tracker_clear(&local_tracker);
3648 
3649     // Now copy staged pages from the CPU to the destination.
3650     status = block_copy_resident_pages_between(block,
3651                                                block_context,
3652                                                dst_id,
3653                                                UVM_ID_CPU,
3654                                                region,
3655                                                staged_pages,
3656                                                prefetch_page_mask,
3657                                                transfer_mode,
3658                                                migrated_pages,
3659                                                &pages_copied,
3660                                                &local_tracker);
3661 
3662     UVM_ASSERT(missing_pages_count >= pages_copied);
3663     missing_pages_count -= pages_copied;
3664 
3665     if (status != NV_OK)
3666         goto out;
3667 
    // If we get here, we were staging the copy through the CPU, so we should
    // have copied as many pages from the CPU as we previously copied to it.
3670     UVM_ASSERT(pages_copied == pages_copied_to_cpu);
3671 
3672 out:
3673     // Add everything from the local tracker to the block's tracker.
3674     // Notably this is also needed for handling
3675     // block_copy_resident_pages_between() failures in the first loop.
3676     tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3677     uvm_tracker_deinit(&local_tracker);
3678 
3679     return status == NV_OK ? tracker_status : status;
3680 }
3681 
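// First phase of uvm_va_block_make_resident(): unmaps pages which are not
// resident on dest_id from the other processors (except UVM-Lite GPUs),
// populates memory on dest_id and copies the resident data to it. The block's
// residency and mapping tracking is updated separately by
// uvm_va_block_make_resident_finish().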
3682 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
3683                                           uvm_va_block_retry_t *va_block_retry,
3684                                           uvm_va_block_context_t *va_block_context,
3685                                           uvm_processor_id_t dest_id,
3686                                           uvm_va_block_region_t region,
3687                                           const uvm_page_mask_t *page_mask,
3688                                           const uvm_page_mask_t *prefetch_page_mask,
3689                                           uvm_make_resident_cause_t cause)
3690 {
3691     NV_STATUS status;
3692     uvm_processor_mask_t unmap_processor_mask;
3693     uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask;
3694     uvm_page_mask_t *resident_mask;
3695 
3696     va_block_context->make_resident.dest_id = dest_id;
3697     va_block_context->make_resident.cause = cause;
3698 
3699     if (prefetch_page_mask) {
3700         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3701                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3702                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3703     }
3704 
3705     uvm_assert_mutex_locked(&va_block->lock);
3706     UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
3707     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
3708 
3709     resident_mask = block_resident_mask_get_alloc(va_block, dest_id);
3710     if (!resident_mask)
3711         return NV_ERR_NO_MEMORY;
3712 
3713     // Unmap all mapped processors except for UVM-Lite GPUs as their mappings
3714     // are largely persistent.
3715     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3716 
3717     if (page_mask)
3718         uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask);
3719     else
3720         uvm_page_mask_complement(unmap_page_mask, resident_mask);
3721     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3722 
3723     // Unmap all pages not resident on the destination
3724     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3725     if (status != NV_OK)
3726         return status;
3727 
3728     if (page_mask)
3729         uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages);
3730     else
3731         uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages);
3732     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3733 
3734     // Also unmap read-duplicated pages excluding dest_id
3735     uvm_processor_mask_clear(&unmap_processor_mask, dest_id);
3736     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3737     if (status != NV_OK)
3738         return status;
3739 
3740     uvm_tools_record_read_duplicate_invalidate(va_block,
3741                                                dest_id,
3742                                                region,
3743                                                unmap_page_mask);
3744 
3745     // Note that block_populate_pages and block_copy_resident_pages also use
3746     // va_block_context->make_resident.page_mask.
3747     unmap_page_mask = NULL;
3748 
3749     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3750     if (status != NV_OK)
3751         return status;
3752 
3753     return block_copy_resident_pages(va_block,
3754                                      va_block_context,
3755                                      dest_id,
3756                                      region,
3757                                      page_mask,
3758                                      prefetch_page_mask,
3759                                      UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
3760 }
3761 
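// Clear the given pages from dst_id's evicted mask and, if no evicted pages
// remain, clear dst_id from the block's evicted_gpus mask.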
3762 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block,
3763                                               uvm_processor_id_t dst_id,
3764                                               uvm_page_mask_t *page_mask)
3765 {
3766     uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id);
3767 
3768     UVM_ASSERT(dst_gpu_state);
3769 
3770     if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask))
3771         uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id);
3772 }
3773 
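// Update the residency tracking after pages in copy_mask have been copied to
// dst_id: mark them resident on the destination, accumulate them into the
// context's pages_changed_residency output mask, and update the eviction
// bookkeeping according to the migration cause.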
3774 static void block_make_resident_update_state(uvm_va_block_t *va_block,
3775                                              uvm_va_block_context_t *va_block_context,
3776                                              uvm_processor_id_t dst_id,
3777                                              uvm_va_block_region_t region,
3778                                              uvm_page_mask_t *copy_mask,
3779                                              uvm_make_resident_cause_t cause)
3780 {
3781     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id);
3782 
3783     uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
3784     block_set_resident_processor(va_block, dst_id);
3785 
3786     // Accumulate the pages that migrated into the output mask.
3787     uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency,
3788                      &va_block_context->make_resident.pages_changed_residency,
3789                      copy_mask);
3790 
3791     // Any move operation implies that mappings have been removed from all
3792     // non-UVM-Lite GPUs.
3793     uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
3794 
    // If we are migrating due to an eviction, mark the source GPUs as evicted
    // and record which pages were evicted. Otherwise, if we are migrating to a
    // GPU with evicted pages, those pages are no longer considered evicted.
3798     if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3799         uvm_processor_id_t src_id;
3800 
3801         UVM_ASSERT(UVM_ID_IS_CPU(dst_id));
3802 
3803         // Note that the destination is the CPU so this loop excludes it.
3804         for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) {
3805             uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id);
3806 
3807             UVM_ASSERT(src_gpu_state);
3808 
3809             uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask);
3810             uvm_processor_mask_set(&va_block->evicted_gpus, src_id);
3811         }
3812     }
3813     else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id))
3814         block_make_resident_clear_evicted(va_block, dst_id, copy_mask);
3815 }
3816 
3817 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
3818                                        uvm_va_block_context_t *va_block_context,
3819                                        uvm_va_block_region_t region,
3820                                        const uvm_page_mask_t *page_mask)
3821 {
3822     uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
3823     uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id;
3824 
3825     uvm_assert_mutex_locked(&va_block->lock);
3826 
3827     if (page_mask)
3828         uvm_page_mask_and(migrated_pages, migrated_pages, page_mask);
3829 
3830     if (!uvm_page_mask_empty(migrated_pages)) {
3831         // The migrated pages are now resident on the destination.
3832         block_make_resident_update_state(va_block,
3833                                          va_block_context,
3834                                          dst_id,
3835                                          region,
3836                                          migrated_pages,
3837                                          va_block_context->make_resident.cause);
3838     }
3839 
3840     // Pages that weren't resident anywhere else were populated at the
3841     // destination directly. Mark them as resident now.
3842     block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask);
3843 
3844     // Break read duplication and clear residency from other processors.
3845     break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask);
3846 
3847     // Update eviction heuristics, if needed. Notably this could repeat the call
3848     // done in block_set_resident_processor(), but that doesn't do anything bad
3849     // and it's simpler to keep it in both places.
3850     //
3851     // Skip this if we didn't do anything (the input region and/or page mask was
3852     // empty).
3853     if (uvm_processor_mask_test(&va_block->resident, dst_id))
3854         block_mark_memory_used(va_block, dst_id);
3855 }
3856 
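// Convenience wrapper which performs both phases of a migration: the copy
// (uvm_va_block_make_resident_copy) followed by the residency/state update
// (uvm_va_block_make_resident_finish).
//
// Illustrative call sketch (not compiled; the region and cause shown are
// placeholders and the caller must hold va_block->lock with a properly
// initialized retry struct and block context):
//
//     status = uvm_va_block_make_resident(va_block,
//                                         &va_block_retry,
//                                         va_block_context,
//                                         gpu->id,
//                                         uvm_va_block_region_from_block(va_block),
//                                         NULL,
//                                         NULL,
//                                         UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT);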
3857 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
3858                                      uvm_va_block_retry_t *va_block_retry,
3859                                      uvm_va_block_context_t *va_block_context,
3860                                      uvm_processor_id_t dest_id,
3861                                      uvm_va_block_region_t region,
3862                                      const uvm_page_mask_t *page_mask,
3863                                      const uvm_page_mask_t *prefetch_page_mask,
3864                                      uvm_make_resident_cause_t cause)
3865 {
3866     NV_STATUS status;
3867 
3868     status = uvm_va_block_make_resident_copy(va_block,
3869                                              va_block_retry,
3870                                              va_block_context,
3871                                              dest_id,
3872                                              region,
3873                                              page_mask,
3874                                              prefetch_page_mask,
3875                                              cause);
3876     if (status != NV_OK)
3877         return status;
3878 
3879     uvm_va_block_make_resident_finish(va_block,
3880                                       va_block_context,
3881                                       region,
3882                                       page_mask);
3883 
3884     return NV_OK;
3885 }
3886 
3887 // Combination function which prepares the input {region, page_mask} for
3888 // entering read-duplication. It:
3889 // - Unmaps all processors but revoke_id
3890 // - Revokes write access from revoke_id
3891 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block,
3892                                                    uvm_va_block_context_t *va_block_context,
3893                                                    uvm_processor_id_t revoke_id,
3894                                                    uvm_va_block_region_t region,
3895                                                    const uvm_page_mask_t *page_mask)
3896 {
3897     uvm_processor_mask_t unmap_processor_mask;
3898     uvm_processor_id_t unmap_id;
3899     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3900     NV_STATUS status, tracker_status;
3901 
3902     // Unmap everybody except revoke_id
3903     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3904     uvm_processor_mask_clear(&unmap_processor_mask, revoke_id);
3905 
3906     for_each_id_in_mask(unmap_id, &unmap_processor_mask) {
3907         status = uvm_va_block_unmap(va_block,
3908                                     va_block_context,
3909                                     unmap_id,
3910                                     region,
3911                                     page_mask,
3912                                     &local_tracker);
3913         if (status != NV_OK)
3914             goto out;
3915     }
3916 
3917     // Revoke WRITE/ATOMIC access permissions from the remaining mapped
3918     // processor.
3919     status = uvm_va_block_revoke_prot(va_block,
3920                                       va_block_context,
3921                                       revoke_id,
3922                                       region,
3923                                       page_mask,
3924                                       UVM_PROT_READ_WRITE,
3925                                       &local_tracker);
3928 
3929 out:
3930     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
3931     uvm_tracker_deinit(&local_tracker);
3932     return status == NV_OK ? tracker_status : status;
3933 }
3934 
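// Read-duplication variant of make-resident: pages are copied rather than
// moved to dest_id, so existing resident copies stay in place and the copied
// pages are added to the block's read_duplicated_pages mask.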
3935 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
3936                                                     uvm_va_block_retry_t *va_block_retry,
3937                                                     uvm_va_block_context_t *va_block_context,
3938                                                     uvm_processor_id_t dest_id,
3939                                                     uvm_va_block_region_t region,
3940                                                     const uvm_page_mask_t *page_mask,
3941                                                     const uvm_page_mask_t *prefetch_page_mask,
3942                                                     uvm_make_resident_cause_t cause)
3943 {
3944     NV_STATUS status = NV_OK;
3945     uvm_processor_id_t src_id;
3946     uvm_page_mask_t *dst_resident_mask;
3947     uvm_page_mask_t *cpu_resident_mask;
3948     uvm_page_mask_t *migrated_pages;
3949     uvm_page_mask_t *staged_pages;
3950     uvm_page_mask_t *first_touch_mask;
3951 
3952     // TODO: Bug 3660922: need to implement HMM read duplication support.
3953     UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
3954     UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
3955 
3956     va_block_context->make_resident.dest_id = dest_id;
3957     va_block_context->make_resident.cause = cause;
3958 
3959     if (prefetch_page_mask) {
3960         // TODO: Bug 1877578: investigate automatic read-duplicate policies
3961         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3962                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3963                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3964     }
3965 
3966     uvm_assert_mutex_locked(&va_block->lock);
3967     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
3968 
3969     // For pages that are entering read-duplication we need to unmap remote
3970     // mappings and revoke RW and higher access permissions.
3971     //
3972     // The current implementation:
3973     // - Unmaps pages from all processors but the one with the resident copy
3974     // - Revokes write access from the processor with the resident copy
3975     for_each_id_in_mask(src_id, &va_block->resident) {
3976         // Note that the below calls to block_populate_pages and
3977         // block_copy_resident_pages also use
3978         // va_block_context->make_resident.page_mask.
3979         uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
3980         const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
3981         UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
3982 
3983         if (page_mask)
3984             uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages);
3985         else
3986             uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages);
3987 
3988         // If there are no pages that need to be unmapped/revoked, skip to the
3989         // next processor
3990         if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask))
3991             continue;
3992 
3993         status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
3994         if (status != NV_OK)
3995             return status;
3996     }
3997 
3998     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3999     if (status != NV_OK)
4000         return status;
4001 
4002     status = block_copy_resident_pages(va_block,
4003                                        va_block_context,
4004                                        dest_id,
4005                                        region,
4006                                        page_mask,
4007                                        prefetch_page_mask,
4008                                        UVM_VA_BLOCK_TRANSFER_MODE_COPY);
4009     if (status != NV_OK)
4010         return status;
4011 
4012     // Pages that weren't resident anywhere else were populated at the
4013     // destination directly. Mark them as resident now, since there were no
4014     // errors from block_copy_resident_pages() above.
    // Note that va_block_context->scratch_page_mask is passed to
    // block_copy_set_first_touch_residency(), which is generally unsafe, but
    // in this case the callee copies the input mask before scratch_page_mask
    // could be clobbered.
4019     migrated_pages = &va_block_context->make_resident.pages_migrated;
4020     first_touch_mask = &va_block_context->scratch_page_mask;
4021     uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
4022     uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
4023 
4024     if (!uvm_page_mask_empty(first_touch_mask))
4025         block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
4026 
4027     staged_pages = &va_block_context->make_resident.pages_staged;
4028     if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
4029         cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU);
4030         uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages);
4031         block_set_resident_processor(va_block, UVM_ID_CPU);
4032         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
4033         uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
4034     }
4035 
4036     if (!uvm_page_mask_empty(migrated_pages)) {
4037         dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id);
4038         uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
4039         block_set_resident_processor(va_block, dest_id);
4040         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
4041         uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
4042     }
4043 
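    // Eviction never takes the read-duplication path, so unlike the move case
    // only the destination GPU's evicted state may need to be cleared here.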
4044     UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION);
4045     if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id))
4046         block_make_resident_clear_evicted(va_block, dest_id, migrated_pages);
4047 
4048     // Update eviction heuristics, if needed. Notably this could repeat the call
4049     // done in block_set_resident_processor(), but that doesn't do anything bad
4050     // and it's simpler to keep it in both places.
4051     //
4052     // Skip this if we didn't do anything (the input region and/or page mask was
4053     // empty).
4054     if (uvm_processor_mask_test(&va_block->resident, dest_id))
4055         block_mark_memory_used(va_block, dest_id);
4056 
4057     return NV_OK;
4058 }
4059 
4060 // Looks up the current CPU mapping state of page from the
4061 // block->cpu.pte_bits bitmaps. If write access is enabled,
4062 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since
4063 // write access implies atomic access for CPUs.
4064 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index)
4065 {
4066     uvm_prot_t prot;
4067 
4068     UVM_ASSERT(!uvm_va_block_is_dead(block));
4069 
4070     if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index))
4071         prot = UVM_PROT_READ_WRITE_ATOMIC;
4072     else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
4073         prot = UVM_PROT_READ_ONLY;
4074     else
4075         prot = UVM_PROT_NONE;
4076 
4077     return prot;
4078 }
4079 
4080 // Looks up the current GPU mapping state of page from the
4081 // block->gpus[i]->pte_bits bitmaps.
4082 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index)
4083 {
4084     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4085     uvm_prot_t prot;
4086 
4087     UVM_ASSERT(!uvm_va_block_is_dead(block));
4088 
4089     if (!gpu_state)
4090         return UVM_PROT_NONE;
4091 
4092     if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
4093         prot = UVM_PROT_READ_WRITE_ATOMIC;
4094     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
4095         prot = UVM_PROT_READ_WRITE;
4096     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
4097         prot = UVM_PROT_READ_ONLY;
4098     else
4099         prot = UVM_PROT_NONE;
4100 
4101     return prot;
4102 }
4103 
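// Returns the current mapping protection of the page on the given processor,
// dispatching to the CPU or GPU helper above.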
4104 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index)
4105 {
4106     if (UVM_ID_IS_CPU(id))
4107         return block_page_prot_cpu(block, page_index);
4108     else
4109         return block_page_prot_gpu(block, block_get_gpu(block, id), page_index);
4110 }
4111 
4112 // Returns true if the block has any valid CPU PTE mapping in the block region.
4113 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region)
4114 {
4115     size_t valid_page;
4116 
4117     UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block));
4118 
4119     // Early-out: check whether any address in this block has a CPU mapping
4120     if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
4121         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]));
4122         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4123         return false;
4124     }
4125 
4126     // All valid mappings have at least read permissions so we only need to
4127     // inspect the read bits.
4128     valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
4129     if (valid_page == region.outer)
4130         return false;
4131 
4132     UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE);
4133     return true;
4134 }
4135 
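// Sanity check that every indirect peer which can access this chunk has a
// reverse sysmem mapping registered that resolves back to this block, page
// index and chunk size.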
4136 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
4137 {
4138     uvm_gpu_t *accessing_gpu;
4139     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4140 
4141     if (!uvm_pmm_sysmem_mappings_indirect_supported())
4142         return true;
4143 
4144     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
4145         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
4146         uvm_reverse_map_t reverse_map;
4147         size_t num_mappings;
4148 
4149         num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
4150                                                            peer_addr,
4151                                                            uvm_gpu_chunk_get_size(chunk),
4152                                                            &reverse_map,
4153                                                            1);
4154         UVM_ASSERT(num_mappings == 1);
4155         UVM_ASSERT(reverse_map.va_block == block);
4156         UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
4157         UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
4158 
4159         uvm_va_block_release_no_destroy(reverse_map.va_block);
4160     }
4161 
4162     return true;
4163 }
4164 
4165 // Sanity check the given GPU's chunks array
4166 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
4167 {
4168     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
4169     uvm_gpu_t *gpu;
4170     size_t i, num_chunks;
4171     uvm_page_index_t page_index;
4172     uvm_chunk_size_t chunk_size;
4173 
4174     if (!gpu_state)
4175         return true;
4176 
4177     gpu = block_get_gpu(block, id);
4178 
4179     num_chunks = block_num_gpu_chunks(block, gpu);
4180     for (page_index = 0, i = 0; i < num_chunks; i++) {
4181         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
4182         size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
4183 
4184         if (chunk_index != i) {
            UVM_ERR_PRINT("chunk index mismatch: calculated %zu, stored at %zu. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u\n",
4186                            chunk_index,
4187                            i,
4188                            block->start,
4189                            block->end + 1,
4190                            uvm_id_value(id),
4191                            page_index);
4192             return false;
4193         }
4194 
4195         if (chunk) {
4196             if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
4197                 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
4198                               chunk_size,
4199                               uvm_gpu_chunk_get_size(chunk),
4200                               block->start,
4201                               block->end + 1,
4202                               uvm_id_value(id),
4203                               page_index,
4204                               i);
4205                 return false;
4206             }
4207 
4208             if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
4209                 UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
4210                               uvm_pmm_gpu_chunk_state_string(chunk->state),
4211                               block->start,
4212                               block->end + 1,
4213                               uvm_id_value(id),
4214                               page_index,
4215                               i,
4216                               chunk_size);
4217                 return false;
4218             }
4219 
4220             UVM_ASSERT(chunk->va_block == block);
4221             UVM_ASSERT(chunk->va_block_page_index == page_index);
4222 
4223             UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
4224         }
4225 
4226         page_index += chunk_size / PAGE_SIZE;
4227     }
4228 
4229     return true;
4230 }
4231 
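// Sanity check the chunk arrays of all GPUs with state in this block, then
// the CPU chunks.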
4232 static bool block_check_chunks(uvm_va_block_t *va_block)
4233 {
4234     uvm_gpu_id_t id;
4235 
4236     for_each_gpu_id(id) {
4237         if (!block_check_gpu_chunks(va_block, id))
4238             return false;
4239     }
4240 
4241     return block_check_cpu_chunks(va_block);
4242 }
4243 
4244 // Sanity checks for page mappings
4245 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index)
4246 {
4247     uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings;
4248     uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings;
4249     uvm_processor_mask_t remaining_mappings, temp_mappings;
4250     uvm_processor_mask_t resident_processors;
4251     const uvm_processor_mask_t *residency_accessible_from = NULL;
4252     const uvm_processor_mask_t *residency_has_native_atomics = NULL;
4253     uvm_processor_id_t residency, id;
4254     uvm_va_range_t *va_range = block->va_range;
4255     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4256     uvm_processor_id_t preferred_location = va_range ?
4257                                             uvm_va_range_get_policy(va_range)->preferred_location :
4258                                             UVM_ID_INVALID;
4259     const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
4260 
4261     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
4262     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
4263     block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings);
4264 
4265     // Each access bit implies all accesses below it
4266     UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings));
4267     UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings));
4268     UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped));
4269 
4270     uvm_va_block_page_resident_processors(block, page_index, &resident_processors);
4271     UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident));
4272 
4273     // Sanity check block_get_mapped_processors
4274     uvm_processor_mask_copy(&remaining_mappings, &read_mappings);
4275     for_each_id_in_mask(residency, &resident_processors) {
4276         block_get_mapped_processors(block, residency, page_index, &temp_mappings);
4277         UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings));
4278         uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings);
4279     }
4280 
4281     // Any remaining mappings point to non-resident locations, so they must be
4282     // UVM-Lite mappings.
4283     UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus));
4284 
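    // Use the first resident processor, if any, as the representative
    // residency for the accessibility and atomics checks below.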
4285     residency = uvm_processor_mask_find_first_id(&resident_processors);
4286 
4287     if (uvm_processor_mask_get_count(&resident_processors) > 0) {
4288         residency_accessible_from    = &va_space->accessible_from[uvm_id_value(residency)];
4289         residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)];
4290     }
4291 
4292     // If the page is not resident, there should be no valid mappings
4293     UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 ||
4294                    uvm_processor_mask_get_count(&read_mappings) == 0,
4295                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4296                    *resident_processors.bitmap,
4297                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4298                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4299                    *block->read_duplicated_pages.bitmap);
4300 
4301     // Test read_duplicated_pages mask
4302     UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 &&
4303                      !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
4304                    (uvm_processor_mask_get_count(&resident_processors) > 1 &&
4305                      uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
4306                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4307                    *resident_processors.bitmap,
4308                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4309                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4310                    *block->read_duplicated_pages.bitmap);
4311 
4312     if (!uvm_processor_mask_empty(uvm_lite_gpus))
4313         UVM_ASSERT(UVM_ID_IS_VALID(preferred_location));
4314 
4315     // UVM-Lite checks. Since the range group is made non-migratable before the
4316     // actual migrations for that range group happen, we can only make those
4317     // checks which are valid on both migratable and non-migratable range
4318     // groups.
4319     uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus);
4320     uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4321 
4322     // Any mapping from a UVM-Lite GPU must be atomic...
4323     UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings));
4324 
4325     // ... and must have access to preferred_location
4326     if (UVM_ID_IS_VALID(preferred_location)) {
4327         const uvm_processor_mask_t *preferred_location_accessible_from;
4328 
4329         preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)];
4330         UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from));
4331     }
4332 
4333     for_each_id_in_mask(id, &lite_atomic_mappings)
4334         UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location));
4335 
4336     // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests
4337     uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus);
4338     uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus);
4339     uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4340 
4341     // Pages set to zero in maybe_mapped_pages must not be mapped on any
4342     // non-UVM-Lite GPU
4343     if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
4344         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0,
4345                        "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
4346                        *resident_processors.bitmap,
4347                        *block->mapped.bitmap,
4348                        *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap);
4349     }
4350 
    // Atomic mappings from GPUs with system-wide atomics disabled are treated
    // as write mappings, so remove them from the atomic mappings mask.
4353     uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
4354 
4355     if (!uvm_processor_mask_empty(&read_mappings)) {
4356         // Read-duplicate: if a page is resident in multiple locations, it
4357         // must be resident locally on each mapped processor.
4358         if (uvm_processor_mask_get_count(&resident_processors) > 1) {
4359             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors),
4360                            "Read-duplicate copies from remote processors\n"
4361                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4362                            *resident_processors.bitmap,
4363                            *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4364                            *va_space->system_wide_atomics_enabled_processors.bitmap,
4365                            *block->read_duplicated_pages.bitmap);
4366         }
4367         else {
4368             // Processors with mappings must have access to the processor that
4369             // has the valid copy
4370             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from),
4371                            "Not all processors have access to %s\n"
4372                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4373                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4374                            uvm_va_space_processor_name(va_space, residency),
4375                            *resident_processors.bitmap,
4376                            *read_mappings.bitmap,
4377                            *write_mappings.bitmap,
4378                            *atomic_mappings.bitmap,
4379                            *residency_accessible_from->bitmap,
4380                            *residency_has_native_atomics->bitmap,
4381                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4382             for_each_id_in_mask(id, &read_mappings) {
4383                 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
4384 
4385                 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
4386                     uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
4387                     uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
4388                     uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL);
4389 
4390                     // This function will assert if no mapping exists
4391                     (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
4392                 }
4393             }
4394         }
4395     }
4396 
4397     // If any processor has a writable mapping, there must only be one copy of
4398     // the page in the system
4399     if (!uvm_processor_mask_empty(&write_mappings)) {
4400         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1,
4401                        "Too many resident copies for pages with write_mappings\n"
4402                        "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4403                        *resident_processors.bitmap,
4404                        *read_mappings.bitmap,
4405                        *write_mappings.bitmap,
4406                        *atomic_mappings.bitmap,
4407                        *va_space->system_wide_atomics_enabled_processors.bitmap,
4408                        *block->read_duplicated_pages.bitmap);
4409     }
4410 
4411     if (!uvm_processor_mask_empty(&atomic_mappings)) {
4412         uvm_processor_mask_t native_atomics;
4413 
4414         uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics);
4415 
4416         if (uvm_processor_mask_empty(&native_atomics)) {
4417             // No other faultable processor should be able to write
4418             uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors);
4419 
4420             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1,
4421                            "Too many write mappings to %s from processors with non-native atomics\n"
4422                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4423                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4424                            uvm_va_space_processor_name(va_space, residency),
4425                            *resident_processors.bitmap,
4426                            *read_mappings.bitmap,
4427                            *write_mappings.bitmap,
4428                            *atomic_mappings.bitmap,
4429                            *residency_accessible_from->bitmap,
4430                            *residency_has_native_atomics->bitmap,
4431                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4432 
4433             // Only one processor outside of the native group can have atomics enabled
4434             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1,
4435                            "Too many atomics mappings to %s from processors with non-native atomics\n"
4436                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4437                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4438                            uvm_va_space_processor_name(va_space, residency),
4439                            *resident_processors.bitmap,
4440                            *read_mappings.bitmap,
4441                            *write_mappings.bitmap,
4442                            *atomic_mappings.bitmap,
4443                            *residency_accessible_from->bitmap,
4444                            *residency_has_native_atomics->bitmap,
4445                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4446         }
4447         else {
4448             uvm_processor_mask_t non_native_atomics;
4449 
4450             // One or more processors within the native group have atomics enabled.
4451             // All processors outside of that group may have write but not atomic
4452             // permissions.
4453             uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics);
4454 
4455             UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics),
4456                            "atomic mappings to %s from processors native and non-native\n"
4457                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4458                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4459                            uvm_va_space_processor_name(va_space, residency),
4460                            *resident_processors.bitmap,
4461                            *read_mappings.bitmap,
4462                            *write_mappings.bitmap,
4463                            *atomic_mappings.bitmap,
4464                            *residency_accessible_from->bitmap,
4465                            *residency_has_native_atomics->bitmap,
4466                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4467         }
4468     }
4469 
4470     return true;
4471 }
4472 
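// Sanity check the GPU's PTE state for this block: consistency between the
// allocated page table ranges (2M/big/4k), the pte_is_2m/big_ptes tracking,
// and the residency of the memory currently mapped.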
4473 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
4474 {
4475     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4476     uvm_va_block_gpu_state_t *resident_gpu_state;
4477     uvm_pte_bits_gpu_t pte_bit;
4478     uvm_processor_id_t resident_id;
4479     uvm_prot_t prot;
4480     NvU32 big_page_size;
4481     size_t num_big_pages, big_page_index;
4482     uvm_va_block_region_t big_region, chunk_region;
4483     uvm_gpu_chunk_t *chunk;
4484 
4485     if (!gpu_state->page_table_range_4k.table)
4486         UVM_ASSERT(!gpu_state->activated_4k);
4487 
4488     if (!gpu_state->page_table_range_big.table) {
4489         UVM_ASSERT(!gpu_state->initialized_big);
4490         UVM_ASSERT(!gpu_state->activated_big);
4491     }
4492 
4493     // It's only safe to check the PTE mappings if we have page tables. See
4494     // uvm_va_block_get_gpu_va_space.
4495     if (!block_gpu_has_page_tables(block, gpu)) {
4496         UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id));
4497         return true;
4498     }
4499 
4500     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4501     num_big_pages = uvm_va_block_num_big_pages(block, big_page_size);
4502 
4503     if (block_gpu_supports_2m(block, gpu)) {
4504         if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) {
4505             // 2M blocks require the 2M entry to be allocated for the lower
4506             // ranges to also be allocated.
4507             UVM_ASSERT(gpu_state->page_table_range_2m.table);
4508         }
4509         else if (gpu_state->page_table_range_2m.table) {
4510             // If the 2M entry is present but the lower ones aren't, the PTE
4511             // must be 2M.
4512             UVM_ASSERT(gpu_state->pte_is_2m);
4513         }
4514     }
4515     else {
4516         UVM_ASSERT(!gpu_state->page_table_range_2m.table);
4517         if (num_big_pages == 0)
4518             UVM_ASSERT(!gpu_state->page_table_range_big.table);
4519     }
4520 
4521     // If we have the big table and it's in use then it must have been
4522     // initialized, even if it doesn't currently contain active PTEs.
4523     if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) ||
4524         (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big))
4525         UVM_ASSERT(gpu_state->initialized_big);
4526 
4527     if (gpu_state->pte_is_2m) {
4528         UVM_ASSERT(block_gpu_supports_2m(block, gpu));
4529         UVM_ASSERT(gpu_state->page_table_range_2m.table);
4530         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
4531         UVM_ASSERT(!gpu_state->force_4k_ptes);
4532 
4533         // GPU architectures which support 2M pages only support 64K as the big
4534         // page size. All of the 2M code assumes that
4535         // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full,
4536         // bitmap_complement, etc).
4537         BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4538 
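        // With a single 2M PTE every page in the block shares the same
        // permissions, so page 0 is representative of the whole block.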
4539         prot = block_page_prot_gpu(block, gpu, 0);
4540 
4541         // All page permissions match
4542         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4543             if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4544                 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit]));
4545             else
4546                 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit]));
4547         }
4548 
4549         if (prot != UVM_PROT_NONE) {
4550             resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
4551 
4552             // block_check_resident_proximity verifies that no closer processor
4553             // has a resident page, so we don't need to check that all pages
4554             // have the same resident_id.
4555 
4556             // block_check_mappings_page verifies that all pages marked resident
4557             // are backed by populated memory.
4558 
4559             // The mapped processor should be fully resident and physically-
4560             // contiguous.
4561             UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id)));
4562 
4563             if (UVM_ID_IS_GPU(resident_id)) {
4564                 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
4565                 UVM_ASSERT(resident_gpu_state);
4566                 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
4567             }
4568             else {
4569                 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block,
4570                                                                        uvm_va_block_region_from_block(block),
4571                                                                        NULL);
4572 
4573                 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
4574                 UVM_ASSERT(chunk);
4575                 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
4576             }
4577         }
4578     }
4579     else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
4580         UVM_ASSERT(gpu_state->page_table_range_big.table);
4581         UVM_ASSERT(!gpu_state->force_4k_ptes);
4582         UVM_ASSERT(num_big_pages > 0);
4583         UVM_ASSERT(gpu_state->initialized_big);
4584 
4585         for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) {
4586             big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4587 
4588             if (!test_bit(big_page_index, gpu_state->big_ptes)) {
4589                 // If there are valid mappings but this isn't a big PTE, the
4590                 // mapping must be using the 4k PTEs.
4591                 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region))
4592                     UVM_ASSERT(gpu_state->page_table_range_4k.table);
4593                 continue;
4594             }
4595 
4596             prot = block_page_prot_gpu(block, gpu, big_region.first);
4597 
4598             // All page permissions match
4599             for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4600                 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4601                     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region));
4602                 else
4603                     UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region));
4604             }
4605 
4606             if (prot != UVM_PROT_NONE) {
4607                 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4608 
4609                 // The mapped processor should be fully resident and physically-
4610                 // contiguous. Exception: UVM-Lite GPUs always map the preferred
4611                 // location even if the memory is resident elsewhere. Skip the
4612                 // residency check but still verify contiguity.
4613                 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
4614                     UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id),
4615                                                          big_region));
4616                 }
4617 
4618                 if (UVM_ID_IS_CPU(resident_id)) {
4619                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first);
4620 
4621                     UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
4622                     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
4623                 }
4624                 else {
4625                     // Check GPU chunks
4626                     chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL);
4627                     chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
4628                     UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
4629                 }
4630             }
4631         }
4632     }
4633 
4634     return true;
4635 }
4636 
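// Sanity check all mapping state of the block: the per-processor master
// masks, the per-page mapping invariants, and each GPU's PTE state.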
4637 static bool block_check_mappings(uvm_va_block_t *block)
4638 {
4639     uvm_page_index_t page_index;
4640     uvm_processor_id_t id;
4641 
4642     // Verify the master masks, since block_check_mappings_page relies on them
4643     for_each_processor_id(id) {
4644         const uvm_page_mask_t *resident_mask, *map_mask;
4645 
4646         if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) {
4647             UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4648             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
4649             UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id));
4650             continue;
4651         }
4652 
4653         resident_mask = uvm_va_block_resident_mask_get(block, id);
4654         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
4655 
4656         map_mask = uvm_va_block_map_mask_get(block, id);
4657         UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask));
4658 
4659         if (UVM_ID_IS_GPU(id)) {
4660             const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id);
4661             UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask));
4662 
4663             // Pages cannot be resident if they are marked as evicted
4664             UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask));
4665 
4666             // Pages cannot be resident on a GPU with no memory
4667             if (!block_processor_has_memory(block, id))
4668                 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4669         }
4670     }
4671 
4672     // Check that every page has coherent mappings
4673     for_each_va_block_page(page_index, block)
4674         block_check_mappings_page(block, page_index);
4675 
4676     for_each_gpu_id(id) {
4677         if (uvm_va_block_gpu_state_get(block, id)) {
4678             uvm_gpu_t *gpu = block_get_gpu(block, id);
4679 
4680             // Check big and/or 2M PTE state
4681             block_check_mappings_ptes(block, gpu);
4682         }
4683     }
4684 
4685     return true;
4686 }
4687 
4688 // See the comments on uvm_va_block_unmap
4689 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages)
4690 {
4691     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4692     uvm_pte_bits_cpu_t pte_bit;
4693     bool unmapped_something = false;
4694     uvm_va_block_region_t subregion;
4695     NvU32 num_mapped_processors;
4696 
4697     // Early-out if nothing in the region is mapped or being unmapped.
4698     if (!block_has_valid_mapping_cpu(block, region) ||
4699         (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])))
4700         return;
4701 
    // HMM ranges can't actually be unmapped from the CPU here; that happens as
    // part of migrate_vma_setup(). Reaching this path for an HMM block is not
    // expected, hence the assert below.
4704     if (uvm_va_block_is_hmm(block)) {
4705         UVM_ASSERT(!uvm_va_block_is_hmm(block));
4706         return;
4707     }
4708 
4709     num_mapped_processors = uvm_processor_mask_get_count(&block->mapped);
4710 
4711     // If we are unmapping a page which we are tracking due to CPU faults with
4712     // correct permissions, clear the info. This will cover both the unmap and
4713     // revoke cases (since we implement CPU revocation by unmap + map)
4714     if (block->cpu.fault_authorized.first_fault_stamp &&
4715         uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index))
4716         block->cpu.fault_authorized.first_fault_stamp = 0;
4717 
4718     for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) {
4719         if (!block_has_valid_mapping_cpu(block, subregion))
4720             continue;
4721 
4722         unmap_mapping_range(va_space->mapping,
4723                             uvm_va_block_region_start(block, subregion),
4724                             uvm_va_block_region_size(subregion), 1);
4725 
4726         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
4727             uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion);
4728 
4729         // If the CPU is the only processor with mappings we can safely mark
4730         // the pages as fully unmapped
4731         if (num_mapped_processors == 1)
4732             uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
4733 
4734         unmapped_something = true;
4735     }
4736 
4737     if (!unmapped_something)
4738         return;
4739 
4740     // Check whether the block has any more mappings
4741     if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) {
4742         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4743         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
4744     }
4745 
4746     UVM_ASSERT(block_check_mappings(block));
4747 }
4748 
4749 // Given a mask of mapped pages, returns true if any of the pages in the mask
4750 // are mapped remotely by the given GPU.
4751 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
4752                                          uvm_va_block_context_t *block_context,
4753                                          uvm_gpu_id_t gpu_id,
4754                                          const uvm_page_mask_t *mapped_pages)
4755 {
4756     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
4757 
4758     if (!gpu_state)
4759         return false;
4760 
4761     // The caller must ensure that all pages of the input mask are really mapped
4762     UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
4763 
4764     // UVM-Lite GPUs map the preferred location if it's accessible, regardless
4765     // of the resident location.
4766     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) {
4767         if (uvm_page_mask_empty(mapped_pages))
4768             return false;
4769 
4770         return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
4771     }
4772 
4773     // Remote pages are pages which are mapped but not resident locally
4774     return uvm_page_mask_andnot(&block_context->scratch_page_mask, mapped_pages, &gpu_state->resident);
4775 }
4776 
4777 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If
4778 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4779 //
4780 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4781 // caller is responsible for ending the TLB batch with the appropriate membar.
4782 static void block_gpu_pte_clear_4k(uvm_va_block_t *block,
4783                                    uvm_gpu_t *gpu,
4784                                    const uvm_page_mask_t *clear_page_mask,
4785                                    NvU64 pte_clear_val,
4786                                    uvm_pte_batch_t *pte_batch,
4787                                    uvm_tlb_batch_t *tlb_batch)
4788 {
4789     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4790     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4791     uvm_gpu_phys_address_t pte_addr;
4792     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4793     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
4794     uvm_va_block_region_t subregion;
4795     size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4796 
4797     for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) {
4798         num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page;
4799 
4800         pte_addr = uvm_page_table_range_entry_address(tree,
4801                                                       &gpu_state->page_table_range_4k,
4802                                                       subregion.first * ptes_per_page);
4803 
4804         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes);
4805 
4806         if (tlb_batch) {
4807             uvm_tlb_batch_invalidate(tlb_batch,
4808                                      uvm_va_block_region_start(block, subregion),
4809                                      uvm_va_block_region_size(subregion),
4810                                      UVM_PAGE_SIZE_4K,
4811                                      UVM_MEMBAR_NONE);
4812         }
4813     }
4814 }
4815 
4816 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id
4817 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use
4818 // block_gpu_pte_clear_4k instead.
4819 //
4820 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4821 //
4822 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4823 // caller is responsible for ending the TLB batch with the appropriate membar.
4824 static void block_gpu_pte_write_4k(uvm_va_block_t *block,
4825                                    uvm_gpu_t *gpu,
4826                                    uvm_processor_id_t resident_id,
4827                                    uvm_prot_t new_prot,
4828                                    const uvm_page_mask_t *write_page_mask,
4829                                    uvm_pte_batch_t *pte_batch,
4830                                    uvm_tlb_batch_t *tlb_batch)
4831 {
4832     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4833     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4834     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4835     const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4836     uvm_va_block_region_t contig_region = {0};
4837     uvm_gpu_phys_address_t contig_addr = {0};
4838     uvm_gpu_phys_address_t page_addr = {0};
4839     uvm_page_index_t page_index;
4840     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
4841 
4842     UVM_ASSERT(new_prot != UVM_PROT_NONE);
4843     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
4844 
4845     for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
4846         uvm_gpu_phys_address_t pte_addr;
4847         size_t i;
4848 
4849         // Assume that this mapping will be used to write to the page
4850         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
4851             block_mark_cpu_page_dirty(block, page_index);
4852 
4853         if (page_index >= contig_region.outer) {
4854             contig_region = block_phys_contig_region(block, page_index, resident_id);
4855             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
4856             page_addr = contig_addr;
4857         }
4858 
4859         page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
4860 
4861         pte_addr = uvm_page_table_range_entry_address(tree,
4862                                                       &gpu_state->page_table_range_4k,
4863                                                       page_index * ptes_per_page);
4864 
        // Handle a CPU PAGE_SIZE larger than 4k: write one 4k GPU PTE per 4k
        // chunk of the CPU page
4866         for (i = 0; i < ptes_per_page; i++) {
4867             NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
4868             uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
4869             page_addr.address += UVM_PAGE_SIZE_4K;
4870             pte_addr.address += pte_size;
4871         }
4872 
4873         if (tlb_batch) {
4874             NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index);
4875             uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE);
4876         }
4877     }
4878 }
4879 
4880 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered.
4881 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It
4882 // only writes 4k PTEs, not big PTEs.
4883 //
4884 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the
4885 // mapping from the corresponding big page (0) and which ones should be written
4886 // using memory from resident_id and new_prot (1). Unlike the other pte_write
4887 // functions, new_prot may be UVM_PROT_NONE.
4888 //
4889 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID
4890 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE
4891 // in this case.
4892 //
4893 // new_pages_mask must not be NULL.
4894 //
4895 // No TLB invalidates are required since we've set up the lower PTEs to never be
4896 // cached by the GPU's MMU when covered by larger PTEs.
4897 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
4898                                              uvm_va_block_context_t *block_context,
4899                                              uvm_gpu_t *gpu,
4900                                              uvm_processor_id_t resident_id,
4901                                              uvm_prot_t new_prot,
4902                                              const unsigned long *big_ptes_covered,
4903                                              const uvm_page_mask_t *new_pages_mask,
4904                                              uvm_pte_batch_t *pte_batch)
4905 {
4906     uvm_va_block_region_t big_region;
4907     size_t big_page_index;
4908     uvm_processor_id_t curr_resident_id;
4909     uvm_prot_t curr_prot;
4910     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4911 
4912     if (UVM_ID_IS_INVALID(resident_id))
4913         UVM_ASSERT(new_prot == UVM_PROT_NONE);
4914 
4915     for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4916         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4917 
4918         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
4919 
4920         // The unmap path doesn't know the current residency ahead of time, so
4921         // we have to look it up.
4922         if (UVM_ID_IS_INVALID(resident_id)) {
4923             curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4924         }
4925         else {
4926             // Check that we aren't changing the aperture of the existing
4927             // mappings. It could be legal in some cases (switching from {RO, A}
4928             // to {RO, B} for example) but we'd need to issue TLB membars.
4929             if (curr_prot != UVM_PROT_NONE)
4930                 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id));
4931 
4932             curr_resident_id = resident_id;
4933         }
4934 
4935         // pages in new_pages_mask under this big page get new_prot
4936         uvm_page_mask_zero(&block_context->scratch_page_mask);
4937         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4938         if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4939             if (new_prot == UVM_PROT_NONE) {
4940                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4941             }
4942             else {
4943                 block_gpu_pte_write_4k(block,
4944                                        gpu,
4945                                        curr_resident_id,
4946                                        new_prot,
4947                                        &block_context->scratch_page_mask,
4948                                        pte_batch,
4949                                        NULL);
4950             }
4951         }
4952 
4953         // All other pages under this big page inherit curr_prot
4954         uvm_page_mask_zero(&block_context->scratch_page_mask);
4955         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4956         if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4957             if (curr_prot == UVM_PROT_NONE) {
4958                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4959             }
4960             else {
4961                 block_gpu_pte_write_4k(block,
4962                                        gpu,
4963                                        curr_resident_id,
4964                                        curr_prot,
4965                                        &block_context->scratch_page_mask,
4966                                        pte_batch,
4967                                        NULL);
4968             }
4969         }
4970     }
4971 }
4972 
4973 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is
4974 // NULL, all big PTEs in the {block, gpu} are cleared.
4975 //
4976 // If tlb_batch is provided, the big PTEs written are added to the batch. The
4977 // caller is responsible for ending the TLB batch with the appropriate membar.
4978 static void block_gpu_pte_clear_big(uvm_va_block_t *block,
4979                                     uvm_gpu_t *gpu,
4980                                     const unsigned long *big_ptes_mask,
4981                                     NvU64 pte_clear_val,
4982                                     uvm_pte_batch_t *pte_batch,
4983                                     uvm_tlb_batch_t *tlb_batch)
4984 {
4985     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4986     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
4987     NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
4988     uvm_gpu_phys_address_t pte_addr;
4989     NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
4990     size_t big_page_index;
4991     DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4992 
4993     if (big_ptes_mask)
4994         bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4995     else
4996         bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size));
4997 
4998     for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4999         pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables,
5000                                                       &gpu_state->page_table_range_big,
5001                                                       big_page_index);
5002         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1);
5003 
5004         if (tlb_batch) {
5005             uvm_tlb_batch_invalidate(tlb_batch,
5006                                      uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
5007                                      big_page_size,
5008                                      big_page_size,
5009                                      UVM_MEMBAR_NONE);
5010         }
5011     }
5012 }
5013 
5014 // Writes the big PTEs in big_ptes_mask using memory from resident_id with
5015 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
5016 // block_gpu_pte_clear_big instead.
5017 //
5018 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL.
5019 //
5020 // If tlb_batch is provided, the big PTEs written are added to the batch. The
5021 // caller is responsible for ending the TLB batch with the appropriate membar.
5022 static void block_gpu_pte_write_big(uvm_va_block_t *block,
5023                                     uvm_gpu_t *gpu,
5024                                     uvm_processor_id_t resident_id,
5025                                     uvm_prot_t new_prot,
5026                                     const unsigned long *big_ptes_mask,
5027                                     uvm_pte_batch_t *pte_batch,
5028                                     uvm_tlb_batch_t *tlb_batch)
5029 {
5030     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5031     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5032     uvm_page_tree_t *tree = &gpu_va_space->page_tables;
5033     NvU32 big_page_size = tree->big_page_size;
5034     NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
5035     size_t big_page_index;
5036     uvm_va_block_region_t contig_region = {0};
5037     uvm_gpu_phys_address_t contig_addr = {0};
5038     uvm_gpu_phys_address_t page_addr = {0};
5039     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5040 
5041     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5042     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5043     UVM_ASSERT(big_ptes_mask);
5044 
5045     if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5046         UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0);
5047 
5048         if (!gpu->parent->can_map_sysmem_with_large_pages)
5049             UVM_ASSERT(UVM_ID_IS_GPU(resident_id));
5050     }
5051 
5052     for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5053         NvU64 pte_val;
5054         uvm_gpu_phys_address_t pte_addr;
5055         uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5056 
5057         // Assume that this mapping will be used to write to the page
5058         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) {
5059             uvm_page_index_t page_index;
5060 
5061             for_each_va_block_page_in_region(page_index, big_region)
5062                 block_mark_cpu_page_dirty(block, page_index);
5063         }
5064 
5065         if (big_region.first >= contig_region.outer) {
5066             contig_region = block_phys_contig_region(block, big_region.first, resident_id);
5067             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
5068             page_addr = contig_addr;
5069         }
5070 
5071         page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
5072 
5073         pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index);
5074         pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5075         uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5076 
5077         if (tlb_batch) {
5078             uvm_tlb_batch_invalidate(tlb_batch,
5079                                      uvm_va_block_region_start(block, big_region),
5080                                      big_page_size,
5081                                      big_page_size,
5082                                      UVM_MEMBAR_NONE);
5083         }
5084     }
5085 }
5086 
5087 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in
5088 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and
5089 // tlb_batch in order to poison the now-unused 4k PTEs.
5090 //
5091 // The 4k PTEs are invalidated with the specified membar.
5092 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
5093                                             uvm_va_block_context_t *block_context,
5094                                             uvm_gpu_t *gpu,
5095                                             const unsigned long *big_ptes_to_merge,
5096                                             uvm_push_t *push,
5097                                             uvm_pte_batch_t *pte_batch,
5098                                             uvm_tlb_batch_t *tlb_batch,
5099                                             uvm_membar_t tlb_membar)
5100 {
5101     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5102     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5103     NvU32 big_page_size = tree->big_page_size;
5104     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
5105     size_t big_page_index;
5106     DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5107 
5108     UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5109     UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5110 
5111     // We can be called with the 4k PTEs in two cases:
5112     // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active.
5113     //
    // 2) 4k PTEs unallocated. In this case the GPU may or may not have invalid
    //    4k PTEs active under the big PTE, depending on whether neighboring
    //    blocks caused the page tables to be allocated.
5117     //
5118     // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has
5119     // them cached.
5120 
    // Each big PTE is currently invalid, so the 4ks are active (or unallocated).
    // First make the big PTEs unmapped to disable future lookups of the 4ks
    // under them. We can't directly transition the entry from valid 4k PTEs to
5124     // valid big PTEs, because that could cause the GPU TLBs to cache the same
5125     // VA in different cache lines. That could cause memory ordering to not be
5126     // maintained.
5127     block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch);
5128 
5129     // Now invalidate the big PTEs we just wrote as well as all 4ks under them.
5130     // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only
5131     // need to invalidate the 4k PTEs without actually writing them.
5132     for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5133         uvm_tlb_batch_invalidate(tlb_batch,
5134                                  uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
5135                                  big_page_size,
5136                                  big_page_size | UVM_PAGE_SIZE_4K,
5137                                  UVM_MEMBAR_NONE);
5138     }
5139 
5140     // End the batches for the caller. We need to do this here in order to
5141     // poison the 4ks below.
5142     uvm_pte_batch_end(pte_batch);
5143     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5144 
5145     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5146     // PTEs with a pattern which will trigger fatal faults on access. We have to
5147     // do this after the TLB invalidate of the big PTEs, or the GPU might use
5148     // the new values.
5149     if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) {
5150         uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge);
5151         uvm_pte_batch_begin(push, pte_batch);
5152         block_gpu_pte_clear_4k(block,
5153                                gpu,
5154                                &block_context->scratch_page_mask,
5155                                tree->hal->poisoned_pte(),
5156                                pte_batch,
5157                                NULL);
5158         uvm_pte_batch_end(pte_batch);
5159     }
5160 }
5161 
5162 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}.
5163 //
5164 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5165 // responsible for ending the TLB batch with the appropriate membar.
5166 static void block_gpu_pte_clear_2m(uvm_va_block_t *block,
5167                                    uvm_gpu_t *gpu,
5168                                    uvm_pte_batch_t *pte_batch,
5169                                    uvm_tlb_batch_t *tlb_batch)
5170 {
5171     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5172     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5173     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5174     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5175 
5176     // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE,
5177     // which would cause a problem when trying to make the entry invalid since
5178     // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire
5179     // 16 bytes.
5180     uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1);
5181 
5182     if (tlb_batch)
5183         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5184 }
5185 
5186 // Writes the 2M PTE for {block, gpu} using memory from resident_id with
5187 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
5188 // block_gpu_pte_clear_2m instead.
5189 //
5190 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5191 // responsible for ending the TLB batch with the appropriate membar.
5192 static void block_gpu_pte_write_2m(uvm_va_block_t *block,
5193                                    uvm_gpu_t *gpu,
5194                                    uvm_processor_id_t resident_id,
5195                                    uvm_prot_t new_prot,
5196                                    uvm_pte_batch_t *pte_batch,
5197                                    uvm_tlb_batch_t *tlb_batch)
5198 {
5199     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5200     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5201     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5202     uvm_gpu_phys_address_t page_addr;
5203     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5204     NvU64 pte_val;
5205     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5206 
5207     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5208     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5209 
5210     if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
5211         block_mark_cpu_page_dirty(block, 0);
5212 
5213     page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu);
5214     pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5215     uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5216 
5217     if (tlb_batch)
5218         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5219 }
5220 
5221 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu)
5222 {
5223     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5224 
5225     if (!block_gpu_supports_2m(block, gpu))
5226         return false;
5227 
5228     if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) ||
5229         (gpu_state->page_table_range_4k.table  && !gpu_state->activated_4k))
5230         return true;
5231 
5232     return false;
5233 }
5234 
5235 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or
5236 // activates a newly-allocated page table (big or 4k) while the other is already
5237 // active. The caller must have already written the new PTEs under the table
5238 // with the appropriate membar.
5239 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch)
5240 {
5241     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5242     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5243 
5244     if (!gpu_state->pte_is_2m)
5245         UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu));
5246 
5247     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5248 
5249     // We always need a membar to order PDE/PTE writes with the TLB invalidate.
5250     // write_pde will do a MEMBAR_SYS by default.
5251     if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID)
5252         uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
5253     uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push);
5254 
5255     gpu->parent->host_hal->wait_for_idle(push);
5256 
5257     // Invalidate just the PDE
5258     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5259 
5260     if (gpu_state->page_table_range_big.table)
5261         gpu_state->activated_big = true;
5262 
5263     if (gpu_state->page_table_range_4k.table)
5264         gpu_state->activated_4k = true;
5265 }
5266 
5267 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should
5268 // have written all lower PTEs as appropriate into the given pte_batch already.
5269 // This function ends the PTE batch, activates the 2M PDE, and does a TLB
5270 // invalidate.
5271 //
5272 // The caller does not need to do any TLB invalidates since none of the lower
5273 // PTEs could be cached.
5274 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block,
5275                                           uvm_gpu_t *gpu,
5276                                           uvm_push_t *push,
5277                                           uvm_pte_batch_t *pte_batch,
5278                                           uvm_tlb_batch_t *tlb_batch,
5279                                           uvm_membar_t tlb_membar)
5280 {
5281     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5282     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5283 
5284     // Step 1: Make the 2M entry invalid. We can't directly transition from a
5285     //         valid 2M PTE to valid lower PTEs, because that could cause the
5286     //         GPU TLBs to cache the same VA in different cache lines. That
5287     //         could cause memory ordering to not be maintained.
5288     //
5289     //         If the 2M PTE is already invalid, no TLB invalidate is needed.
5290 
5291     if (curr_prot == UVM_PROT_NONE) {
5292         // If we aren't downgrading, then we don't need a membar.
5293         UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE);
5294 
5295         // End the batch, which pushes a membar to ensure that the caller's PTE
5296         // writes below 2M are observed before the PDE write we're about to do.
5297         uvm_pte_batch_end(pte_batch);
5298     }
5299     else {
5300         // The 64k and 4k PTEs can't possibly be cached since the 2M entry is
5301         // not yet a PDE, so we just need to invalidate this single 2M entry.
5302         uvm_tlb_batch_begin(tree, tlb_batch);
5303         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5304 
5305         // Make sure the PTE writes are observed before the TLB invalidate
5306         uvm_pte_batch_end(pte_batch);
5307         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5308     }
5309 
5310     // Step 2: Switch the 2M entry from invalid to a PDE. This activates the
5311     //         smaller PTEs.
5312     uvm_tlb_batch_begin(tree, tlb_batch);
5313     block_gpu_write_pde(block, gpu, push, tlb_batch);
5314     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5315 }
5316 
5317 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE.
5318 // Any lower PTEs are invalidated with the specified membar.
5319 static void block_gpu_pte_merge_2m(uvm_va_block_t *block,
5320                                    uvm_va_block_context_t *block_context,
5321                                    uvm_gpu_t *gpu,
5322                                    uvm_push_t *push,
5323                                    uvm_membar_t tlb_membar)
5324 {
5325     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5326     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5327     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5328     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5329     NvU32 tlb_inval_sizes;
5330 
5331     UVM_ASSERT(!gpu_state->pte_is_2m);
5332     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5333 
5334     // The 2M entry is currently a PDE, so first make it invalid. We can't
5335     // directly transition the entry from a valid PDE to a valid 2M PTE, because
5336     // that could cause the GPU TLBs to cache the same VA in different cache
5337     // lines. That could cause memory ordering to not be maintained.
5338     uvm_pte_batch_begin(push, pte_batch);
5339     block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL);
5340     uvm_pte_batch_end(pte_batch);
5341 
5342     // Now invalidate both the 2M entry we just wrote as well as all lower-level
5343     // entries which could be cached. Subsequent MMU fills will stop at the now-
5344     // invalid 2M entry, so we only need to invalidate the lower PTEs without
5345     // actually writing them.
5346     tlb_inval_sizes = UVM_PAGE_SIZE_2M;
5347     if (gpu_state->page_table_range_big.table)
5348         tlb_inval_sizes |= UVM_PAGE_SIZE_64K;
5349 
    // Strictly speaking, we only need to invalidate the 4k ranges which are not
    // covered by a big PTE. However, any such invalidate would require enough
    // individual 4k invalidates to force the TLB batching to invalidate
    // everything anyway, so just do the simpler thing.
5354     if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5355         tlb_inval_sizes |= UVM_PAGE_SIZE_4K;
5356 
5357     uvm_tlb_batch_begin(tree, tlb_batch);
5358     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE);
5359     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5360 
5361     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5362     // PTEs with a pattern which will trigger fatal faults on access. We have to
5363     // do this after the TLB invalidate of the 2M entry, or the GPU might use
5364     // the new values.
5365     if (UVM_IS_DEBUG()) {
5366         uvm_pte_batch_begin(push, pte_batch);
5367 
5368         if (gpu_state->page_table_range_big.table) {
5369             block_gpu_pte_clear_big(block,
5370                                     gpu,
5371                                     NULL,
5372                                     tree->hal->poisoned_pte(),
5373                                     pte_batch,
5374                                     NULL);
5375         }
5376 
5377         if (gpu_state->page_table_range_4k.table) {
5378             block_gpu_pte_clear_4k(block,
5379                                    gpu,
5380                                    NULL,
5381                                    tree->hal->poisoned_pte(),
5382                                    pte_batch,
5383                                    NULL);
5384         }
5385 
5386         uvm_pte_batch_end(pte_batch);
5387     }
5388 }
5389 
5390 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
5391 {
5392     // Permissions upgrades (MAP) don't need membars
5393     if (pte_op == BLOCK_PTE_OP_MAP)
5394         return UVM_MEMBAR_NONE;
5395 
5396     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5397     UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE);
5398 
5399     return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id));
5400 }
5401 
5402 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot
5403 // permissions. If the 2M entry is currently a PDE, it is first merged into a
5404 // PTE.
5405 //
5406 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead.
5407 //
5408 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5409 // the TLB membar required.
5410 static void block_gpu_map_to_2m(uvm_va_block_t *block,
5411                                 uvm_va_block_context_t *block_context,
5412                                 uvm_gpu_t *gpu,
5413                                 uvm_processor_id_t resident_id,
5414                                 uvm_prot_t new_prot,
5415                                 uvm_push_t *push,
5416                                 block_pte_op_t pte_op)
5417 {
5418     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5419     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5420     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5421     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5422     uvm_membar_t tlb_membar;
5423 
5424     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5425 
5426     // If we have a mix of big and 4k PTEs, we have to first merge them to an
5427     // invalid 2M PTE.
5428     if (!gpu_state->pte_is_2m) {
5429         block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE);
5430 
5431         gpu_state->pte_is_2m = true;
5432         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5433     }
5434 
5435     // Write the new permissions
5436     uvm_pte_batch_begin(push, pte_batch);
5437     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5438 
5439     block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch);
5440 
5441     uvm_pte_batch_end(pte_batch);
5442 
5443     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5444     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5445 }
5446 
5447 // Combination split + map operation, called when only part of a 2M PTE mapping
5448 // is being changed. This splits an existing valid or invalid 2M PTE into the
5449 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state.
5450 //
5451 // The PTEs covering the pages in pages_to_write are written to the memory on
5452 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5453 //
5454 // The PTEs covering the pages not set in pages_to_write inherit the mapping of
5455 // the current 2M PTE. If the current mapping is valid, it must target
5456 // resident_id.
5457 //
5458 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5459 // the TLB membar required.
5460 static void block_gpu_map_split_2m(uvm_va_block_t *block,
5461                                    uvm_va_block_context_t *block_context,
5462                                    uvm_gpu_t *gpu,
5463                                    uvm_processor_id_t resident_id,
5464                                    const uvm_page_mask_t *pages_to_write,
5465                                    uvm_prot_t new_prot,
5466                                    uvm_push_t *push,
5467                                    block_pte_op_t pte_op)
5468 {
5469     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5470     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5471     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5472     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5473     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5474     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5475     uvm_membar_t tlb_membar;
5476     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5477     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5478     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5479 
5480     UVM_ASSERT(gpu_state->pte_is_2m);
5481 
5482     if (!gpu_state->page_table_range_4k.table)
5483         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5484 
5485     uvm_pte_batch_begin(push, pte_batch);
5486 
5487     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5488     // from the lower levels. This means we don't need to issue a TLB invalidate
5489     // when writing those levels.
5490 
5491     // Cases to handle:
5492     // 1) Big PTEs which inherit curr_prot
5493     // 2) Big PTEs which get new_prot
5494     // 3) Big PTEs which are split to 4k
5495     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5496     //    b) 4k PTEs which get new_prot under the split big PTEs
5497 
5498     // Compute the big PTEs which will need to be split to 4k, if any.
5499     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5500 
5501     if (gpu_state->page_table_range_big.table) {
5502         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5503         // any. These are the big PTEs which are unchanged (uncovered) by the
5504         // operation.
5505         bitmap_andnot(big_ptes_inherit,
5506                       new_pte_state->big_ptes,
5507                       new_pte_state->big_ptes_covered,
5508                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5509 
5510         if (curr_prot == UVM_PROT_NONE) {
5511             block_gpu_pte_clear_big(block,
5512                                     gpu,
5513                                     big_ptes_inherit,
5514                                     tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
5515                                     pte_batch,
5516                                     NULL);
5517         }
5518         else {
5519             block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
5520         }
5521 
5522         // Case 2: Write the new big PTEs
5523         bitmap_and(big_ptes_new_prot,
5524                    new_pte_state->big_ptes,
5525                    new_pte_state->big_ptes_covered,
5526                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5527         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL);
5528 
5529         // Case 3: Write the big PTEs which cover 4k PTEs
5530         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5531 
5532         // We just wrote all possible big PTEs, so mark them as initialized
5533         gpu_state->initialized_big = true;
5534     }
5535     else {
5536         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5537     }
5538 
5539     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
5540     block_gpu_pte_big_split_write_4k(block,
5541                                      block_context,
5542                                      gpu,
5543                                      resident_id,
5544                                      new_prot,
5545                                      big_ptes_split,
5546                                      pages_to_write,
5547                                      pte_batch);
5548 
5549     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5550     // invalidate for the 2M entry.
5551     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5552     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
5553 
5554     gpu_state->pte_is_2m = false;
5555     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5556 }
5557 
5558 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed.
5559 //
5560 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should
5561 // be 4k.
5562 static void block_gpu_split_2m(uvm_va_block_t *block,
5563                                uvm_va_block_context_t *block_context,
5564                                uvm_gpu_t *gpu,
5565                                const unsigned long *new_big_ptes,
5566                                uvm_push_t *push)
5567 {
5568     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5569     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5570     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5571     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5572     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5573     DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5574     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5575     NvU64 unmapped_pte_val;
5576     uvm_processor_id_t curr_residency;
5577 
5578     UVM_ASSERT(gpu_state->pte_is_2m);
5579 
5580     if (new_big_ptes)
5581         bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5582     else
5583         bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5584 
5585     if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5586         UVM_ASSERT(gpu_state->page_table_range_big.table);
5587 
    // We're splitting the 2M entry, so all big PTEs will be written below
5589     if (gpu_state->page_table_range_big.table)
5590         gpu_state->initialized_big = true;
5591 
5592     // Cases to handle:
5593     // 1) Big PTEs which inherit curr_prot
5594     // 2) Big PTEs which are split to 4k
5595     //    a) 4k PTEs inherit curr_prot under the split big PTEs
5596 
5597     // big_ptes_split will cover the 4k regions
5598     bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5599     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split);
5600 
5601     uvm_pte_batch_begin(push, pte_batch);
5602 
5603     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5604     // from the lower levels. This means we don't need to issue a TLB invalidate
5605     // when writing those levels.
5606 
5607     if (curr_prot == UVM_PROT_NONE) {
5608         unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size);
5609 
5610         // Case 2a: Clear the 4k PTEs under big_ptes_split
5611         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5612 
5613         // Case 1: Make the remaining big PTEs unmapped
5614         block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL);
5615     }
5616     else {
5617         curr_residency = block_gpu_get_processor_to_map(block, gpu, 0);
5618 
5619         // Case 2a: Write the new 4k PTEs under big_ptes_split
5620         block_gpu_pte_write_4k(block,
5621                                gpu,
5622                                curr_residency,
5623                                curr_prot,
5624                                &block_context->scratch_page_mask,
5625                                pte_batch,
5626                                NULL);
5627 
5628         // Case 1: Write the new big PTEs
5629         block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL);
5630     }
5631 
5632     // Case 2: Make big_ptes_split invalid to activate the 4k PTEs
5633     if (gpu_state->page_table_range_big.table)
5634         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5635 
5636     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5637     // invalidate for the 2M entry. No membar is necessary since we aren't
5638     // changing permissions.
5639     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE);
5640 
5641     gpu_state->pte_is_2m = false;
5642     bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5643 }
5644 
5645 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are
5646 // changed.
5647 //
5648 // big_ptes_to_split must not be NULL.
5649 static void block_gpu_split_big(uvm_va_block_t *block,
5650                                 uvm_va_block_context_t *block_context,
5651                                 uvm_gpu_t *gpu,
5652                                 const unsigned long *big_ptes_to_split,
5653                                 uvm_push_t *push)
5654 {
5655     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5656     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5657     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5658     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5659     NvU32 big_page_size = tree->big_page_size;
5660     uvm_va_block_region_t big_region;
5661     uvm_processor_id_t resident_id;
5662     size_t big_page_index;
5663     uvm_prot_t curr_prot;
5664     DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5665 
5666     UVM_ASSERT(!gpu_state->pte_is_2m);
5667     UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5668     UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5669 
5670     uvm_pte_batch_begin(push, pte_batch);
5671     uvm_tlb_batch_begin(tree, tlb_batch);
5672 
5673     // Write all 4k PTEs under all big PTEs which are being split. We'll make
5674     // the big PTEs inactive below after flushing these writes. No TLB
5675     // invalidate is needed since the big PTE is active.
5676     bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5677     for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5678         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5679         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
5680 
5681         uvm_page_mask_zero(&block_context->scratch_page_mask);
5682         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
5683         if (curr_prot == UVM_PROT_NONE) {
5684             block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5685         }
5686         else {
5687             __set_bit(big_page_index, big_ptes_valid);
5688 
5689             resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
5690 
5691             block_gpu_pte_write_4k(block,
5692                                    gpu,
5693                                    resident_id,
5694                                    curr_prot,
5695                                    &block_context->scratch_page_mask,
5696                                    pte_batch,
5697                                    NULL);
5698         }
5699     }
5700 
5701     // Unmap the big PTEs which are valid and are being split to 4k. We can't
5702     // directly transition from a valid big PTE to valid lower PTEs, because
5703     // that could cause the GPU TLBs to cache the same VA in different cache
5704     // lines. That could cause memory ordering to not be maintained.
5705     block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
5706 
5707     // End the batches. We have to commit the membars and TLB invalidates
5708     // before we finish splitting formerly-big PTEs. No membar is necessary
5709     // since we aren't changing permissions.
5710     uvm_pte_batch_end(pte_batch);
5711     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5712 
5713     // Finish the split by switching the big PTEs from unmapped to invalid. This
5714     // causes the GPU MMU to start reading the 4k PTEs instead of stopping at
5715     // the unmapped big PTEs.
5716     uvm_pte_batch_begin(push, pte_batch);
5717     uvm_tlb_batch_begin(tree, tlb_batch);
5718 
5719     block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch);
5720 
5721     uvm_pte_batch_end(pte_batch);
5722 
5723     // Finally, activate the page tables if they're inactive
5724     if (block_gpu_needs_to_activate_table(block, gpu))
5725         block_gpu_write_pde(block, gpu, push, tlb_batch);
5726 
5727     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5728 
5729     bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5730 }
5731 
5732 // Changes permissions on some pre-existing mix of big and 4k PTEs into some
5733 // other mix of big and 4k PTEs, as described by
5734 // block_context->mapping.new_pte_state.
5735 //
5736 // The PTEs covering the pages in pages_to_write are written to the memory on
5737 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5738 //
5739 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5740 // the TLB membar required.
5741 static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
5742                                      uvm_va_block_context_t *block_context,
5743                                      uvm_gpu_t *gpu,
5744                                      uvm_processor_id_t resident_id,
5745                                      const uvm_page_mask_t *pages_to_write,
5746                                      uvm_prot_t new_prot,
5747                                      uvm_push_t *push,
5748                                      block_pte_op_t pte_op)
5749 {
5750     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5751     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5752     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5753     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5754     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5755     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5756     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5757     DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5758     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5759     uvm_va_block_region_t big_region;
5760     size_t big_page_index;
5761     NvU32 big_page_size = tree->big_page_size;
5762     uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5763 
5764     UVM_ASSERT(!gpu_state->pte_is_2m);
5765 
5766     uvm_pte_batch_begin(push, pte_batch);
5767     uvm_tlb_batch_begin(tree, tlb_batch);
5768 
    // All of these cases might be performed in the same call:
5770     // 1) Split currently-big PTEs to 4k
5771     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
5772     //    b) Write new 4k PTEs which get new_prot under the split big PTEs
5773     // 2) Merge currently-4k PTEs to big with new_prot
5774     // 3) Write currently-big PTEs which wholly get new_prot
5775     // 4) Write currently-4k PTEs which get new_prot
5776     // 5) Initialize big PTEs which are not covered by this operation
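    //
    // Made-up example of how the masks below combine: if gpu_state->big_ptes =
    // 0b0011 and new_pte_state->big_ptes = 0b0110, then big_ptes_split
    // (big_before && !big_after) = 0b0001, the merge mask (!big_before &&
    // big_after) = 0b0100, big PTE 1 stays big, and 4k pages outside all of
    // those regions fall under case 4.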
5777 
5778     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
5779     // being split. We'll make the big PTEs inactive below after flushing these
5780     // writes. No TLB invalidate is needed since the big PTE is active.
5781     //
5782     // Mask computation: big_before && !big_after
5783     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5784 
5785     block_gpu_pte_big_split_write_4k(block,
5786                                      block_context,
5787                                      gpu,
5788                                      resident_id,
5789                                      new_prot,
5790                                      big_ptes_split,
5791                                      pages_to_write,
5792                                      pte_batch);
5793 
5794     // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and
5795     // remain uncovered after the operation.
5796     //
5797     // Mask computation: !big_before && !big_after
5798     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5799     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
5800     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) {
5801         block_gpu_pte_write_4k(block,
5802                                gpu,
5803                                resident_id,
5804                                new_prot,
5805                                &block_context->scratch_page_mask,
5806                                pte_batch,
5807                                tlb_batch);
5808     }
5809 
    // Case 5: If the big page table is newly-allocated, make sure that all big
    // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
    // initialized to invalid.
5813     //
5814     // The similar case of making newly-allocated big PTEs unmapped when no
5815     // lower 4k table is present is handled by having
5816     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
5817     // appropriately.
5818     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
5819         // TODO: Bug 1766424: If we have the 4k page table already, we could
5820         //       attempt to merge all uncovered big PTE regions when first
5821         //       allocating the big table. That's probably not worth doing.
5822         UVM_ASSERT(gpu_state->page_table_range_4k.table);
5823         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5824         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
5825         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
5826         gpu_state->initialized_big = true;
5827     }
5828 
5829     // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are
5830     // being split to 4k. We can't directly transition from a valid big PTE to
5831     // valid lower PTEs, because that could cause the GPU TLBs to cache the same
    // VA in different cache lines, which could break memory ordering.
5834     bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5835     for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5836         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5837         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first))
5838             __set_bit(big_page_index, big_ptes_mask);
5839     }
5840 
5841     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
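    // unmapped_pte() is the special "unmapped" pattern rather than an invalid
    // (zero) PTE, so these big PTEs remain active and the MMU does not yet read
    // the 4k level beneath them. They're cleared to invalid in case 1 step 2
    // below, after the batches above have been committed.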
5842 
5843     // Case 3: Write the currently-big PTEs which remain big PTEs, and are
5844     // wholly changing permissions.
5845     //
5846     // Mask computation: big_before && big_after && covered
5847     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5848     if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5849         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch);
5850 
5851     // Case 2 (step 1): Merge the new big PTEs and end the batches, now that
5852     // we've done all of the independent PTE writes we can. This also merges
5853     // newly-allocated uncovered big PTEs to unmapped (see
5854     // block_gpu_compute_new_pte_state).
5855     //
5856     // Mask computation: !big_before && big_after
5857     if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5858         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
5859         // batches.
5860         block_gpu_pte_merge_big_and_end(block,
5861                                         block_context,
5862                                         gpu,
5863                                         big_ptes_merge,
5864                                         push,
5865                                         pte_batch,
5866                                         tlb_batch,
5867                                         tlb_membar);
5868 
5869         // Remove uncovered big PTEs. We needed to merge them to unmapped above,
5870         // but they shouldn't get new_prot below.
5871         bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5872     }
5873     else {
5874         // End the batches. We have to commit the membars and TLB invalidates
5875         // before we finish splitting formerly-big PTEs.
5876         uvm_pte_batch_end(pte_batch);
5877         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5878     }
5879 
5880     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5881         !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5882         block_gpu_needs_to_activate_table(block, gpu)) {
5883 
5884         uvm_pte_batch_begin(push, pte_batch);
5885         uvm_tlb_batch_begin(tree, tlb_batch);
5886 
5887         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
5888         // switching them from unmapped to invalid. This causes the GPU MMU to
5889         // start reading the 4k PTEs instead of stopping at the unmapped big
5890         // PTEs.
5891         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
5892 
5893         // Case 2 (step 2): Finish merging our big PTEs, if we have any, by
5894         // switching them from unmapped to new_prot.
5895         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch);
5896 
5897         uvm_pte_batch_end(pte_batch);
5898 
5899         // Finally, activate the page tables if they're inactive
5900         if (block_gpu_needs_to_activate_table(block, gpu))
5901             block_gpu_write_pde(block, gpu, push, tlb_batch);
5902 
5903         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5904     }
5905 
5906     // Update gpu_state
5907     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5908 }
5909 
5910 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is
5911 // merged into a PTE.
5912 static void block_gpu_unmap_to_2m(uvm_va_block_t *block,
5913                                   uvm_va_block_context_t *block_context,
5914                                   uvm_gpu_t *gpu,
5915                                   uvm_push_t *push,
5916                                   uvm_membar_t tlb_membar)
5917 {
5918     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5919     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5920     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5921     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5922 
5923     if (gpu_state->pte_is_2m) {
5924         // If we're already mapped as a valid 2M PTE, just write it to invalid
5925         uvm_pte_batch_begin(push, pte_batch);
5926         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5927 
5928         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5929 
5930         uvm_pte_batch_end(pte_batch);
5931         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5932     }
5933     else {
5934         // Otherwise we have a mix of big and 4K PTEs which need to be merged
5935         // into an invalid 2M PTE.
5936         block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar);
5937 
5938         gpu_state->pte_is_2m = true;
5939         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5940     }
5941 }
5942 
5943 // Combination split + unmap operation, called when only part of a valid 2M PTE
5944 // mapping is being unmapped. The 2M PTE is split into a mix of valid and
5945 // invalid big and/or 4k PTEs, as described by
5946 // block_context->mapping.new_pte_state.
5947 //
5948 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
5949 //
5950 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of
5951 // the current 2M PTE.
5952 static void block_gpu_unmap_split_2m(uvm_va_block_t *block,
5953                                      uvm_va_block_context_t *block_context,
5954                                      uvm_gpu_t *gpu,
5955                                      const uvm_page_mask_t *pages_to_unmap,
5956                                      uvm_push_t *push,
5957                                      uvm_membar_t tlb_membar)
5958 {
5959     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5960     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5961     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5962     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5963     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5964     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5965     uvm_processor_id_t resident_id;
5966     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5967     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5968     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5969 
5970     UVM_ASSERT(gpu_state->pte_is_2m);
5971 
5972     resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
5973 
5974     uvm_pte_batch_begin(push, pte_batch);
5975 
5976     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5977     // from the lower levels. This means we don't need to issue a TLB invalidate
5978     // when writing those levels.
5979 
5980     // Cases to handle:
5981     // 1) Big PTEs which inherit curr_prot
5982     // 2) Big PTEs which get unmapped
5983     // 3) Big PTEs which are split to 4k
5984     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5985     //    b) 4k PTEs which get unmapped under the split big PTEs
5986 
5987     // Compute the big PTEs which will need to be split to 4k, if any.
5988     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5989 
5990     if (gpu_state->page_table_range_big.table) {
5991         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5992         // any. These are the big PTEs which are unchanged (uncovered) by the
5993         // operation.
5994         bitmap_andnot(big_ptes_inherit,
5995                       new_pte_state->big_ptes,
5996                       new_pte_state->big_ptes_covered,
5997                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5998 
5999         block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
6000 
        // Case 2: Clear to the unmapped pattern the new big PTEs which get
        // wholly unmapped (those which don't cover any 4k PTEs)
6003         bitmap_and(big_ptes_new_prot,
6004                    new_pte_state->big_ptes,
6005                    new_pte_state->big_ptes_covered,
6006                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6007 
6008         block_gpu_pte_clear_big(block,
6009                                 gpu,
6010                                 big_ptes_new_prot,
6011                                 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
6012                                 pte_batch,
6013                                 NULL);
6014 
        // Case 3: Write as invalid the big PTEs which are being split to 4k,
        // so that once the 2M entry becomes a PDE the MMU reads the 4k PTEs
        // beneath them.
6016         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6017 
6018         // We just wrote all possible big PTEs, so mark them as initialized
6019         gpu_state->initialized_big = true;
6020     }
6021     else {
6022         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6023         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6024     }
6025 
6026     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
6027     block_gpu_pte_big_split_write_4k(block,
6028                                      block_context,
6029                                      gpu,
6030                                      resident_id,
6031                                      UVM_PROT_NONE,
6032                                      big_ptes_split,
6033                                      pages_to_unmap,
6034                                      pte_batch);
6035 
6036     // And activate the 2M PDE. This ends the pte_batch and issues a single TLB
6037     // invalidate for the 2M entry.
6038     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6039 
6040     gpu_state->pte_is_2m = false;
6041     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6042 }
6043 
6044 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
6045 // and 4k PTEs.
6046 //
6047 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
6048 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
6049                                        uvm_va_block_context_t *block_context,
6050                                        uvm_gpu_t *gpu,
6051                                        const uvm_page_mask_t *pages_to_unmap,
6052                                        uvm_push_t *push,
6053                                        uvm_membar_t tlb_membar)
6054 {
6055     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6056     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6057     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6058     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6059     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6060     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6061     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6062     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6063     NvU32 big_page_size = tree->big_page_size;
6064     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6065 
6066     UVM_ASSERT(!gpu_state->pte_is_2m);
6067 
6068     uvm_pte_batch_begin(push, pte_batch);
6069     uvm_tlb_batch_begin(tree, tlb_batch);
6070 
    // All of these cases might be performed in the same call:
6072     // 1) Split currently-big PTEs to 4k
6073     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
6074     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
6075     // 2) Merge currently-4k PTEs to unmapped big
6076     // 3) Clear currently-big PTEs which wholly get unmapped
6077     // 4) Clear currently-4k PTEs which get unmapped
6078     // 5) Initialize big PTEs which are not covered by this operation
6079 
6080     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
6081     // being split. We'll make the big PTEs inactive below after flushing these
6082     // writes. No TLB invalidate is needed since the big PTE is active.
6083     //
6084     // Mask computation: big_before && !big_after
6085     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6086 
6087     block_gpu_pte_big_split_write_4k(block,
6088                                      block_context,
6089                                      gpu,
6090                                      UVM_ID_INVALID,
6091                                      UVM_PROT_NONE,
6092                                      big_ptes_split,
6093                                      pages_to_unmap,
6094                                      pte_batch);
6095 
6096     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
6097     // remain uncovered after the unmap.
6098     //
6099     // Mask computation: !big_before && !big_after
6100     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6101     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
6102     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
6103         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
6104 
    // Case 5: If the big page table is newly-allocated, make sure that all big
    // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs)
    // are initialized to invalid.
6108     //
6109     // The similar case of making newly-allocated big PTEs unmapped when no
6110     // lower 4k table is present is handled by having
6111     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
6112     // appropriately.
6113     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
6114         // TODO: Bug 1766424: If we have the 4k page table already, we could
6115         //       attempt to merge all uncovered big PTE regions when first
6116         //       allocating the big table. That's probably not worth doing.
6117         UVM_ASSERT(gpu_state->page_table_range_4k.table);
6118         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6119         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
6120         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
6121         gpu_state->initialized_big = true;
6122     }
6123 
6124     // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are
6125     // getting wholly unmapped, and those currently-big PTEs which are being
6126     // split to 4k. We can't directly transition from a valid big PTE to valid
6127     // lower PTEs, because that could cause the GPU TLBs to cache the same VA in
    // different cache lines, which could break memory ordering.
6130     //
6131     // Mask computation: (big_before && big_after && covered) ||
6132     //                   (big_before && !big_after)
6133     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6134     bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6135     bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
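    // The three bitmap ops above build that mask term by term: the two ANDs
    // compute (big_before && big_after && covered), and the OR then adds in
    // big_ptes_split, which is (big_before && !big_after).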
6136     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch);
6137 
6138     // Case 2: Merge the new big PTEs and end the batches, now that we've done
6139     // all of the independent PTE writes we can.
6140     //
6141     // Mask computation: !big_before && big_after
6142     if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
6143         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
6144         // batches.
6145         block_gpu_pte_merge_big_and_end(block,
6146                                         block_context,
6147                                         gpu,
6148                                         big_ptes_mask,
6149                                         push,
6150                                         pte_batch,
6151                                         tlb_batch,
6152                                         tlb_membar);
6153     }
6154     else {
6155         // End the batches. We have to commit the membars and TLB invalidates
6156         // before we finish splitting formerly-big PTEs.
6157         uvm_pte_batch_end(pte_batch);
6158         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6159     }
6160 
6161     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
6162         block_gpu_needs_to_activate_table(block, gpu)) {
6163         uvm_pte_batch_begin(push, pte_batch);
6164         uvm_tlb_batch_begin(tree, tlb_batch);
6165 
6166         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
6167         // switching them from unmapped to invalid. This causes the GPU MMU to
6168         // start reading the 4k PTEs instead of stopping at the unmapped big
6169         // PTEs.
6170         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
6171 
6172         uvm_pte_batch_end(pte_batch);
6173 
6174         // Finally, activate the page tables if they're inactive
6175         if (block_gpu_needs_to_activate_table(block, gpu))
6176             block_gpu_write_pde(block, gpu, push, tlb_batch);
6177 
6178         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
6179     }
6180 
6181     // Update gpu_state
6182     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6183 }
6184 
6185 // When PTE state is about to change (for example due to a map/unmap/revoke
6186 // operation), this function decides how to split and merge the PTEs in response
6187 // to that operation.
6188 //
6189 // The operation is described with the two page masks:
6190 //
6191 // - pages_changing indicates which pages will have their PTE mappings changed
6192 //   on the GPU in some way as a result of the operation (for example, which
6193 //   pages will actually have their mapping permissions upgraded).
6194 //
6195 // - page_mask_after indicates which pages on this GPU will have exactly the
6196 //   same PTE attributes (permissions, residency) as pages_changing after the
6197 //   operation is applied.
6198 //
6199 // PTEs are merged eagerly.
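//
// For example, for the unmap path in block_unmap_gpu, pages_changing is the
// set of currently-mapped pages being unmapped and page_mask_after is every
// page which will have no mapping on this GPU once the unmap completes.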
6200 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
6201                                             uvm_gpu_t *gpu,
6202                                             uvm_processor_id_t resident_id,
6203                                             const uvm_page_mask_t *pages_changing,
6204                                             const uvm_page_mask_t *page_mask_after,
6205                                             uvm_va_block_new_pte_state_t *new_pte_state)
6206 {
6207     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6208     uvm_va_block_region_t big_region_all, big_page_region, region;
6209     NvU32 big_page_size;
6210     uvm_page_index_t page_index;
6211     size_t big_page_index;
6212     DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6213     bool can_make_new_big_ptes;
6214 
6215     memset(new_pte_state, 0, sizeof(*new_pte_state));
6216     new_pte_state->needs_4k = true;
6217 
6218     // TODO: Bug 1676485: Force a specific page size for perf testing
6219 
6220     if (gpu_state->force_4k_ptes)
6221         return;
6222 
6223     // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(),
6224     // hmm_range_fault(), and make_device_exclusive_range() don't handle folios
    // yet. Larger allocations would also make mremap() difficult, since the
    // new address may not align with the GPU block size.
    // If PAGE_SIZE is 64K, the code following this check is OK since 64K big
    // pages are supported on all GPUs which support HMM (Turing+).
6229     // TODO: Bug 3368756: add support for transparent huge pages (THP).
6230     if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K)
6231         return;
6232 
6233     UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after));
6234 
    // If all pages in the 2M mask will have the same attributes after the
    // operation is applied, and the backing memory (if any) is physically
    // contiguous across the whole block, we can use a single 2M PTE.
6237     if (block_gpu_supports_2m(block, gpu) &&
6238         uvm_page_mask_full(page_mask_after) &&
6239         (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) {
6240         new_pte_state->pte_is_2m = true;
6241         new_pte_state->needs_4k = false;
6242         return;
6243     }
6244 
6245     // Find big PTEs with matching attributes
6246 
6247     // Can this block fit any big pages?
6248     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
6249     big_region_all = uvm_va_block_big_page_region_all(block, big_page_size);
6250     if (big_region_all.first >= big_region_all.outer)
6251         return;
6252 
6253     new_pte_state->needs_4k = false;
6254 
6255     can_make_new_big_ptes = true;
6256 
6257     // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+).
6258     if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages)
6259         can_make_new_big_ptes = false;
6260 
    // We must not fail during teardown, which performs an unmap
    // (resident_id == UVM_ID_INVALID) requiring no splits. That means we
    // should avoid allocating PTEs which are only needed for merges.
6264     //
6265     // This only matters if we're merging to big PTEs. If we're merging to 2M,
6266     // then we must already have the 2M level (since it has to be allocated
6267     // before the lower levels).
6268     //
6269     // If pte_is_2m already and we don't have a big table, we're splitting so we
6270     // have to allocate.
6271     if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m)
6272         can_make_new_big_ptes = false;
6273 
6274     for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
6275         uvm_va_block_region_t contig_region = {0};
6276 
6277         big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
6278         big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6279 
6280         if (!UVM_ID_IS_INVALID(resident_id))
6281             contig_region = block_phys_contig_region(block, page_index, resident_id);
6282 
6283         __set_bit(big_page_index, new_pte_state->big_ptes_covered);
6284 
6285         // When mapping sysmem, we can use big pages only if we are mapping all
6286         // pages in the big page subregion and the CPU pages backing the
6287         // subregion are physically contiguous.
6288         if (can_make_new_big_ptes &&
6289             uvm_page_mask_region_full(page_mask_after, big_page_region) &&
6290             (!UVM_ID_IS_CPU(resident_id) ||
6291              (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) {
6292             __set_bit(big_page_index, new_pte_state->big_ptes);
6293         }
6294 
6295         if (!test_bit(big_page_index, new_pte_state->big_ptes))
6296             new_pte_state->needs_4k = true;
6297 
        // Skip to the end of the big page region; the loop's increment then
        // advances page_index past it.
6299         page_index = big_page_region.outer - 1;
6300     }
6301 
6302     if (!new_pte_state->needs_4k) {
6303         // All big page regions in pages_changing will be big PTEs. Now check if
6304         // there are any unaligned pages outside of big_region_all which are
6305         // changing.
6306         region = uvm_va_block_region(0, big_region_all.first);
6307         if (!uvm_page_mask_region_empty(pages_changing, region)) {
6308             new_pte_state->needs_4k = true;
6309         }
6310         else {
6311             region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block));
6312             if (!uvm_page_mask_region_empty(pages_changing, region))
6313                 new_pte_state->needs_4k = true;
6314         }
6315     }
6316 
6317     // Now add in the PTEs which should be big but weren't covered by this
6318     // operation.
6319     //
6320     // Note that we can't assume that a given page table range has been
6321     // initialized if it's present here, since it could have been allocated by a
6322     // thread which had to restart its operation due to allocation retry.
6323     if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) {
6324         // We're splitting a 2M PTE so all of the uncovered big PTE regions will
6325         // become big PTEs which inherit the 2M permissions. If we haven't
6326         // allocated the 2M table yet, it will start as a 2M PTE until the lower
6327         // levels are allocated, so it's the same split case regardless of
6328         // whether this operation will need to retry a later allocation.
6329         bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6330     }
6331     else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) {
6332         // If we don't have 4k PTEs and we won't be allocating them for this
6333         // operation, all of our PTEs need to be big.
6334         UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6335         bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6336         bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size));
6337     }
6338     else {
6339         // Otherwise, add in all of the currently-big PTEs which are unchanging.
6340         // They won't be written, but they need to be carried into the new
6341         // gpu_state->big_ptes when it's updated.
6342         bitmap_andnot(big_ptes_not_covered,
6343                       gpu_state->big_ptes,
6344                       new_pte_state->big_ptes_covered,
6345                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6346     }
6347 
6348     bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6349 }
6350 
6351 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that
6352 // handles allocation retry. If the block lock has been unlocked and relocked as
6353 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal
6354 // to the caller that the operation likely needs to be restarted. If that
6355 // happens, the pending tracker is added to the block's tracker.
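//
// A caller typically handles the retry like this (illustrative sketch, not a
// verbatim caller):
//
//     status = block_alloc_pt_range_with_retry(block, gpu, page_size, range,
//                                              pending_tracker);
//     if (status == NV_ERR_MORE_PROCESSING_REQUIRED) {
//         // The block lock was dropped and re-acquired, so block state may
//         // have changed: recompute the operation's state and restart it.
//     }
//     else if (status != NV_OK) {
//         return status;
//     }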
6356 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
6357                                                  uvm_gpu_t *gpu,
6358                                                  NvU32 page_size,
6359                                                  uvm_page_table_range_t *page_table_range,
6360                                                  uvm_tracker_t *pending_tracker)
6361 {
6362     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6363     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6364     uvm_page_tree_t *page_tables = &gpu_va_space->page_tables;
6365     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
6366     uvm_page_table_range_t local_range;
6367     NV_STATUS status;
6368 
6369     // Blocks may contain large PTEs without starting on a PTE boundary or
6370     // having an aligned size. Cover the PTEs of this size in the block's
6371     // interior so we match uvm_va_block_gpu_state_t::big_ptes.
6372     NvU64 start = UVM_ALIGN_UP(va_block->start, page_size);
6373     NvU64 size  = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start;
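    // For example (illustrative numbers), a block spanning [64K + 4K, 192K)
    // with a 64K page_size only covers the interior big page [128K, 192K):
    // start is 128K and size is 64K.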
6374 
6375     // VA blocks which can use the 2MB level as either a PTE or a PDE need to
6376     // account for the PDE specially, so they must use uvm_page_tree_alloc_table
6377     // to allocate the lower levels.
6378     bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M;
6379 
6380     UVM_ASSERT(page_table_range->table == NULL);
6381 
6382     if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) {
6383         --va_block_test->page_table_allocation_retry_force_count;
6384         status = NV_ERR_NO_MEMORY;
6385     }
6386     else if (use_alloc_table) {
6387         // Pascal+: 4k/64k tables under a 2M entry
6388         UVM_ASSERT(gpu_state->page_table_range_2m.table);
6389         status = uvm_page_tree_alloc_table(page_tables,
6390                                            page_size,
6391                                            UVM_PMM_ALLOC_FLAGS_NONE,
6392                                            &gpu_state->page_table_range_2m,
6393                                            page_table_range);
6394     }
6395     else {
6396         // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+
6397         status = uvm_page_tree_get_ptes(page_tables,
6398                                         page_size,
6399                                         start,
6400                                         size,
6401                                         UVM_PMM_ALLOC_FLAGS_NONE,
6402                                         page_table_range);
6403     }
6404 
6405     if (status == NV_OK)
6406         goto allocated;
6407 
6408     if (status != NV_ERR_NO_MEMORY)
6409         return status;
6410 
6411     // Before unlocking the block lock, any pending work on the block has to be
6412     // added to the block's tracker.
6413     if (pending_tracker) {
6414         status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker);
6415         if (status != NV_OK)
6416             return status;
6417     }
6418 
6419     // Unlock the va block and retry with eviction enabled
6420     uvm_mutex_unlock(&va_block->lock);
6421 
6422     if (use_alloc_table) {
6423         // Although we don't hold the block lock here, it's safe to pass
6424         // gpu_state->page_table_range_2m to the page tree code because we know
6425         // that the 2m range has already been allocated, and that it can't go
6426         // away while we have the va_space lock held.
6427         status = uvm_page_tree_alloc_table(page_tables,
6428                                            page_size,
6429                                            UVM_PMM_ALLOC_FLAGS_EVICT,
6430                                            &gpu_state->page_table_range_2m,
6431                                            &local_range);
6432     }
6433     else {
6434         status = uvm_page_tree_get_ptes(page_tables,
6435                                         page_size,
6436                                         start,
6437                                         size,
6438                                         UVM_PMM_ALLOC_FLAGS_EVICT,
6439                                         &local_range);
6440     }
6441 
6442     uvm_mutex_lock(&va_block->lock);
6443 
6444     if (status != NV_OK)
6445         return status;
6446 
6447     status = NV_ERR_MORE_PROCESSING_REQUIRED;
6448 
6449     if (page_table_range->table) {
        // A different caller allocated the page tables in the meantime, so
        // release the local copy.
6452         uvm_page_tree_put_ptes(page_tables, &local_range);
6453         return status;
6454     }
6455 
6456     *page_table_range = local_range;
6457 
6458 allocated:
6459     // Mark the 2M PTE as active when we first allocate it, since we don't have
6460     // any PTEs below it yet.
6461     if (page_size == UVM_PAGE_SIZE_2M) {
6462         UVM_ASSERT(!gpu_state->pte_is_2m);
6463         gpu_state->pte_is_2m = true;
6464     }
6465     else if (page_size != UVM_PAGE_SIZE_4K) {
6466         // uvm_page_tree_get_ptes initializes big PTEs to invalid.
6467         // uvm_page_tree_alloc_table does not, so we'll have to do it later.
6468         if (use_alloc_table)
6469             UVM_ASSERT(!gpu_state->initialized_big);
6470         else
6471             gpu_state->initialized_big = true;
6472     }
6473 
6474     return status;
6475 }
6476 
6477 // Helper which allocates all page table ranges necessary for the given page
6478 // sizes. See block_alloc_pt_range_with_retry.
6479 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
6480                                              uvm_gpu_t *gpu,
6481                                              NvU32 page_sizes,
6482                                              uvm_tracker_t *pending_tracker)
6483 {
6484     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6485     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6486     uvm_page_table_range_t *range;
6487     NvU32 page_size;
6488     NV_STATUS status, final_status = NV_OK;
6489 
6490     UVM_ASSERT(gpu_state);
6491 
6492     // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first
6493     // in order to allocate the levels below.
6494     if (block_gpu_supports_2m(va_block, gpu))
6495         page_sizes |= UVM_PAGE_SIZE_2M;
6496 
6497     UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes);
6498 
6499     for_each_chunk_size_rev(page_size, page_sizes) {
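        // for_each_chunk_size_rev iterates from the largest page size down, so
        // on GPUs which support 2M the 2M range is allocated before the big/4k
        // levels which are parented under it (see
        // block_alloc_pt_range_with_retry).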
6500         if (page_size == UVM_PAGE_SIZE_2M)
6501             range = &gpu_state->page_table_range_2m;
6502         else if (page_size == UVM_PAGE_SIZE_4K)
6503             range = &gpu_state->page_table_range_4k;
6504         else
6505             range = &gpu_state->page_table_range_big;
6506 
6507         if (range->table)
6508             continue;
6509 
6510         if (page_size == UVM_PAGE_SIZE_2M) {
6511             UVM_ASSERT(!gpu_state->pte_is_2m);
6512             UVM_ASSERT(!gpu_state->page_table_range_big.table);
6513             UVM_ASSERT(!gpu_state->page_table_range_4k.table);
6514         }
6515         else if (page_size != UVM_PAGE_SIZE_4K) {
6516             UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0);
6517             UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6518         }
6519 
6520         status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker);
6521 
6522         // Keep going to allocate the remaining levels even if the allocation
6523         // requires a retry, since we'll likely still need them when we retry
6524         // anyway.
6525         if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
6526             final_status = NV_ERR_MORE_PROCESSING_REQUIRED;
6527         else if (status != NV_OK)
6528             return status;
6529     }
6530 
6531     return final_status;
6532 }
6533 
6534 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
6535                                             uvm_gpu_t *gpu,
6536                                             uvm_va_block_new_pte_state_t *new_pte_state,
6537                                             uvm_tracker_t *pending_tracker)
6538 {
6539     NvU32 page_sizes = 0;
6540 
6541     if (new_pte_state->pte_is_2m) {
6542         page_sizes |= UVM_PAGE_SIZE_2M;
6543     }
6544     else {
6545         if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6546             page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu);
6547 
6548         if (new_pte_state->needs_4k)
6549             page_sizes |= UVM_PAGE_SIZE_4K;
6550         else
6551             UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6552     }
6553 
6554     return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker);
6555 }
6556 
6557 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block.
6558 // This is currently used on ATS systems to prevent GPUs from inadvertently
6559 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables,
6560 // which is where the NOATS bit resides.
6561 //
6562 // The current implementation simply pre-allocates the PTEs for the VA Block,
6563 // which is wasteful because the GPU may never need them.
6564 //
6565 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1
6566 // page table entries without having to request PTEs.
6567 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
6568                                              uvm_gpu_va_space_t *gpu_va_space,
6569                                              uvm_tracker_t *pending_tracker)
6570 {
6571     NvU32 page_sizes;
6572     NvU32 big_page_size;
6573     uvm_gpu_t *gpu;
6574     uvm_va_block_gpu_state_t *gpu_state;
6575 
6576     UVM_ASSERT(block);
6577     UVM_ASSERT(gpu_va_space);
6578     UVM_ASSERT(gpu_va_space->ats.enabled);
6579     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
6580 
6581     gpu = gpu_va_space->gpu;
6582     big_page_size = gpu_va_space->page_tables.big_page_size;
6583 
6584     gpu_state = block_gpu_state_get_alloc(block, gpu);
6585     if (!gpu_state)
6586         return NV_ERR_NO_MEMORY;
6587 
6588     // If the VA Block supports 2M pages, allocate the 2M PTE only, as it
6589     // requires less memory
6590     if (block_gpu_supports_2m(block, gpu))
6591         page_sizes = UVM_PAGE_SIZE_2M;
6592     else if (uvm_va_block_num_big_pages(block, big_page_size) > 0)
6593         page_sizes = big_page_size;
6594     else
6595         page_sizes = UVM_PAGE_SIZE_4K;
6596 
6597     return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker);
6598 }
6599 
6600 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker)
6601 {
6602     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6603     NV_STATUS status = NV_OK;
6604 
6605     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
6606     // comments in block_pre_populate_pde1_gpu.
6607     if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) {
6608         uvm_gpu_va_space_t *gpu_va_space;
6609 
6610         for_each_gpu_va_space(gpu_va_space, va_space) {
6611             // We only care about systems where ATS is supported and the application
6612             // enabled it.
6613             if (!gpu_va_space->ats.enabled)
6614                 continue;
6615 
6616             status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker);
6617             if (status != NV_OK)
6618                 break;
6619         }
6620     }
6621 
6622     return status;
6623 }
6624 
6625 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
6626                                  uvm_va_block_context_t *block_context,
6627                                  uvm_gpu_t *gpu,
6628                                  const uvm_page_mask_t *unmap_page_mask,
6629                                  uvm_tracker_t *out_tracker)
6630 {
6631     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6632     uvm_pte_bits_gpu_t pte_bit;
6633     uvm_push_t push;
6634     uvm_membar_t tlb_membar;
6635     bool only_local_mappings;
6636     uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask;
6637     NV_STATUS status;
6638     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6639     bool mask_empty;
6640 
6641     // We have to check gpu_state before looking at any VA space state like our
6642     // gpu_va_space, because we could be on the eviction path where we don't
6643     // have a lock on that state. However, since remove_gpu_va_space walks each
6644     // block to unmap the GPU before destroying the gpu_va_space, we're
6645     // guaranteed that if this GPU has page tables, the gpu_va_space can't go
6646     // away while we're holding the block lock.
6647     if (!block_gpu_has_page_tables(block, gpu))
6648         return NV_OK;
6649 
6650     if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]))
6651         return NV_OK;
6652 
6653     // block_gpu_compute_new_pte_state needs a mask of pages which will have
6654     // matching attributes after the operation is performed. In the case of
6655     // unmap, those are the pages with unset bits.
6656     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap);
6657     uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask);
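    // scratch_page_mask now holds every page which will have no read mapping
    // on this GPU once the unmap completes, which is exactly the "matching
    // attributes" mask block_gpu_compute_new_pte_state expects for an unmap.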
6658     block_gpu_compute_new_pte_state(block,
6659                                     gpu,
6660                                     UVM_ID_INVALID,
6661                                     pages_to_unmap,
6662                                     &block_context->scratch_page_mask,
6663                                     new_pte_state);
6664 
6665     status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker);
6666     if (status != NV_OK)
6667         return status;
6668 
6669     only_local_mappings = !block_has_remote_mapping_gpu(block, block_context, gpu->id, pages_to_unmap);
6670     tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings);
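    // When every mapping being removed points at this GPU's local memory, a
    // GPU-scope membar should be sufficient for the downgrade; otherwise a
    // system-scope membar is needed (uvm_hal_downgrade_membar_type makes that
    // choice).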
6671 
6672     status = uvm_push_begin_acquire(gpu->channel_manager,
6673                                     UVM_CHANNEL_TYPE_MEMOPS,
6674                                     &block->tracker,
6675                                     &push,
6676                                     "Unmapping pages in block [0x%llx, 0x%llx)",
6677                                     block->start,
6678                                     block->end + 1);
6679     if (status != NV_OK)
6680         return status;
6681 
6682     if (new_pte_state->pte_is_2m) {
6683         // We're either unmapping a whole valid 2M PTE, or we're unmapping all
6684         // remaining pages in a split 2M PTE.
6685         block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar);
6686     }
6687     else if (gpu_state->pte_is_2m) {
6688         // The block is currently mapped as a valid 2M PTE and we're unmapping
6689         // some pages within the 2M, so we have to split it into the appropriate
6690         // mix of big and 4k PTEs.
6691         block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6692     }
6693     else {
6694         // We're unmapping some pre-existing mix of big and 4K PTEs into some
6695         // other mix of big and 4K PTEs.
6696         block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6697     }
6698 
6699     uvm_push_end(&push);
6700 
6701     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
6702         uvm_processor_mask_t non_uvm_lite_gpus;
6703         uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block));
6704 
6705         UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id));
6706 
6707         // If the GPU is the only non-UVM-Lite processor with mappings, we can
6708         // safely mark pages as fully unmapped
6709         if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1)
6710             uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
6711     }
6712 
6713     // Clear block PTE state
6714     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
6715         mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit],
6716                                            &gpu_state->pte_bits[pte_bit],
6717                                            pages_to_unmap);
6718         if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty)
6719             uvm_processor_mask_clear(&block->mapped, gpu->id);
6720     }
6721 
6722     UVM_ASSERT(block_check_mappings(block));
6723 
6724     return uvm_tracker_add_push_safe(out_tracker, &push);
6725 }
6726 
6727 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
6728                              uvm_va_block_context_t *va_block_context,
6729                              uvm_processor_id_t id,
6730                              uvm_va_block_region_t region,
6731                              const uvm_page_mask_t *unmap_page_mask,
6732                              uvm_tracker_t *out_tracker)
6733 {
6734     uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask;
6735 
6736     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
6737     uvm_assert_mutex_locked(&va_block->lock);
6738 
6739     if (UVM_ID_IS_CPU(id)) {
6740        block_unmap_cpu(va_block, region, unmap_page_mask);
6741        return NV_OK;
6742     }
6743 
6744     uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask);
6745 
6746     return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker);
6747 }
6748 
6749 // This function essentially works as a wrapper around vm_insert_page (hence
6750 // the similar function prototype). This is needed since vm_insert_page
6751 // doesn't take permissions as input, but uses vma->vm_page_prot instead.
6752 // Since we may have multiple VA blocks under one VMA which need to map
6753 // with different permissions, we have to manually change vma->vm_page_prot for
6754 // each call to vm_insert_page. Multiple faults under one VMA in separate
6755 // blocks can be serviced concurrently, so the VMA wrapper lock is used
6756 // to protect access to vma->vm_page_prot.
6757 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma,
6758                                      NvU64 addr,
6759                                      struct page *page,
6760                                      uvm_prot_t new_prot)
6761 {
6762     uvm_vma_wrapper_t *vma_wrapper;
6763     unsigned long target_flags;
6764     pgprot_t target_pgprot;
6765     int ret;
6766 
6767     UVM_ASSERT(vma);
6768     UVM_ASSERT(vma->vm_private_data);
6769 
6770     vma_wrapper = vma->vm_private_data;
6771     target_flags = vma->vm_flags;
6772 
6773     if (new_prot == UVM_PROT_READ_ONLY)
6774         target_flags &= ~VM_WRITE;
6775 
6776     target_pgprot = vm_get_page_prot(target_flags);
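    // vm_get_page_prot() converts the (possibly downgraded) vm_flags into the
    // page protection the kernel would use for a VMA with those flags; that's
    // the value vma->vm_page_prot needs to hold before vm_insert_page runs.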
6777 
6778     // Take VMA wrapper lock to check vma->vm_page_prot
6779     uvm_down_read(&vma_wrapper->lock);
6780 
    // Take a write lock if we need to modify the VMA's vm_page_prot:
    // - vma->vm_page_prot creates writable PTEs but new_prot is RO
    // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
6784     if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) {
6785         uvm_up_read(&vma_wrapper->lock);
6786         uvm_down_write(&vma_wrapper->lock);
6787 
6788         vma->vm_page_prot = target_pgprot;
6789 
6790         uvm_downgrade_write(&vma_wrapper->lock);
6791     }
6792 
6793     ret = vm_insert_page(vma, addr, page);
6794     uvm_up_read(&vma_wrapper->lock);
6795     if (ret) {
6796         UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
6797         return errno_to_nv_status(ret);
6798     }
6799 
6800     return NV_OK;
6801 }
6802 
6803 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block,
6804                                        uvm_va_block_context_t *va_block_context,
6805                                        uvm_page_index_t page_index)
6806 {
6807     struct vm_area_struct *vma;
6808     uvm_prot_t logical_prot;
6809 
6810     if (uvm_va_block_is_hmm(va_block)) {
6811         NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index);
6812 
6813         logical_prot = uvm_hmm_compute_logical_prot(va_block, va_block_context, addr);
6814     }
6815     else {
6816         uvm_va_range_t *va_range = va_block->va_range;
6817 
6818         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6819 
6820         // Zombified VA ranges no longer have a vma, so they have no permissions
6821         if (uvm_va_range_is_managed_zombie(va_range)) {
6822             logical_prot = UVM_PROT_NONE;
6823         }
6824         else {
6825             vma = uvm_va_range_vma(va_range);
6826 
6827             if (!(vma->vm_flags & VM_READ))
6828                 logical_prot = UVM_PROT_NONE;
6829             else if (!(vma->vm_flags & VM_WRITE))
6830                 logical_prot = UVM_PROT_READ_ONLY;
6831             else
6832                 logical_prot = UVM_PROT_READ_WRITE_ATOMIC;
6833         }
6834     }
6835 
6836     return logical_prot;
6837 }
6838 
6839 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page)
6840 {
6841     struct page *page;
6842 
6843     if (UVM_ID_IS_CPU(block_page.processor)) {
6844         page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index);
6845     }
6846     else {
6847         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6848         uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor);
6849         size_t chunk_offset;
6850         uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
6851 
6852         UVM_ASSERT(gpu->mem_info.numa.enabled);
6853         page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE;
6854     }
6855 
6856     UVM_ASSERT(page);
6857     return page;
6858 }
6859 
6860 // Creates or upgrades a CPU mapping for the given page, updating the block's
6861 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page
6862 // will be mapped with at least new_prot permissions.
6863 //
6864 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use
6865 // block_unmap_cpu or uvm_va_block_revoke_prot instead.
6866 //
6867 // If the existing mapping is >= new_prot already, this is a no-op.
6868 //
6869 // It is the caller's responsibility to:
6870 //  - Revoke mappings from other processors as appropriate so the CPU can map
6871 //    with new_prot permissions
6872 //  - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference
6873 //    and mmap_lock is held in at least read mode)
6874 //  - Ensure that the struct page corresponding to the physical memory being
6875 //    mapped exists
6876 //  - Manage the block's residency bitmap
6877 //  - Ensure that the block hasn't been killed (block->va_range is present)
6878 //  - Update the pte/mapping tracking state on success
6879 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
6880                                        uvm_va_block_context_t *va_block_context,
6881                                        uvm_processor_id_t resident_id,
6882                                        uvm_page_index_t page_index,
6883                                        uvm_prot_t new_prot)
6884 {
6885     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
6886     uvm_va_range_t *va_range = block->va_range;
6887     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6888     struct vm_area_struct *vma;
6889     NV_STATUS status;
6890     NvU64 addr;
6891     struct page *page;
6892 
6893     UVM_ASSERT(uvm_va_block_is_hmm(block) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6894     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6895     UVM_ASSERT(new_prot < UVM_PROT_MAX);
6896     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6897 
6898     uvm_assert_mutex_locked(&block->lock);
6899     if (UVM_ID_IS_CPU(resident_id))
6900         UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index));
6901 
6902     // For the CPU, write implies atomic
6903     if (new_prot == UVM_PROT_READ_WRITE)
6904         new_prot = UVM_PROT_READ_WRITE_ATOMIC;
6905 
6906     // Only upgrades are supported in this function
6907     UVM_ASSERT(curr_prot <= new_prot);
6908 
6909     if (new_prot == curr_prot)
6910         return NV_OK;
6911 
6912     // Check for existing VMA permissions. They could have been modified after
6913     // the initial mmap by mprotect.
6914     if (new_prot > compute_logical_prot(block, va_block_context, page_index))
6915         return NV_ERR_INVALID_ACCESS_TYPE;
6916 
6917     if (uvm_va_block_is_hmm(block)) {
6918         // Do not map CPU pages because they belong to the Linux kernel.
6919         return NV_OK;
6920     }
6921 
6922     UVM_ASSERT(va_range);
6923 
6924     if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
6925         // Add the page's range group range to the range group's migrated list.
6926         uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
6927                                                                   uvm_va_block_cpu_page_address(block, page_index));
6928         if (rgr != NULL) {
6929             uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
6930             if (list_empty(&rgr->range_group_migrated_list_node))
6931                 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
6932             uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
6933         }
6934     }
6935 
6936     // It's possible here that current->mm != vma->vm_mm. That can happen for
6937     // example due to access_process_vm (ptrace) or get_user_pages from another
6938     // driver.
6939     //
6940     // In such cases the caller has taken care of ref counting vma->vm_mm for
6941     // us, so we can safely operate on the vma but we can't use
6942     // uvm_va_range_vma_current.
6943     vma = uvm_va_range_vma(va_range);
6944     uvm_assert_mmap_lock_locked(vma->vm_mm);
6945     UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm);
6946 
6947     // Add the mapping
6948     addr = uvm_va_block_cpu_page_address(block, page_index);
6949 
    // This unmap handles upgrades as vm_insert_page returns -EBUSY when
    // there's already a mapping present at addr, so we have to unmap first
    // anyway when upgrading from RO -> RW.
6953     if (curr_prot != UVM_PROT_NONE)
6954         unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1);
6955 
6956     // Don't map the CPU until prior copies and GPU PTE updates finish,
6957     // otherwise we might not stay coherent.
6958     status = uvm_tracker_wait(&block->tracker);
6959     if (status != NV_OK)
6960         return status;
6961 
6962     page = block_page_get(block, block_phys_page(resident_id, page_index));
6963     return uvm_cpu_insert_page(vma, addr, page, new_prot);
6964 }
6965 
6966 // Maps the CPU to the given pages which are resident on resident_id.
6967 // map_page_mask is an in/out parameter: the pages which are mapped to
6968 // resident_id are removed from the mask before returning.
6969 //
6970 // Caller must ensure that:
// -  Pages in map_page_mask must not be set in the corresponding cpu.pte_bits
//    mask for the requested protection.
6973 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
6974                                   uvm_va_block_context_t *block_context,
6975                                   uvm_processor_id_t resident_id,
6976                                   uvm_va_block_region_t region,
6977                                   uvm_page_mask_t *map_page_mask,
6978                                   uvm_prot_t new_prot,
6979                                   uvm_tracker_t *out_tracker)
6980 {
6981     NV_STATUS status = NV_OK;
6982     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6983     uvm_page_index_t page_index;
6984     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
6985     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id);
6986     uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
6987     uvm_pte_bits_cpu_t pte_bit;
6988 
6989     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6990 
6991     // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls
6992     //       within block_map_cpu_page_to by doing them once here is helpful.
6993 
6994     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
6995                                   map_page_mask,
6996                                   &block->cpu.pte_bits[prot_pte_bit]));
6997 
6998     // The pages which will actually change are those in the input page mask
6999     // which are resident on the target.
7000     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
7001         return NV_OK;
7002 
7003     status = block_pre_populate_pde1_all_gpus(block, out_tracker);
7004     if (status != NV_OK)
7005         return status;
7006 
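    // Remember that the CPU has mapped this block at least once. This is
    // consulted by uvm_va_block_add_gpu_va_space below to decide whether PDEs
    // need to be pre-populated for ATS-enabled GPU VA spaces.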
7007     block->cpu.ever_mapped = true;
7008 
7009     for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) {
7010         status = block_map_cpu_page_to(block,
7011                                        block_context,
7012                                        resident_id,
7013                                        page_index,
7014                                        new_prot);
7015         if (status != NV_OK)
7016             break;
7017 
7018         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
7019     }
7020 
7021     // If there was some error, shrink the region so that we only update the
7022     // pte/mapping tracking bits for the pages that succeeded
7023     if (status != NV_OK) {
7024         region = uvm_va_block_region(region.first, page_index);
7025         uvm_page_mask_region_clear_outside(pages_to_map, region);
7026     }
7027 
    // If pages are mapped from a remote residency, report the remote mapping
    // events to tools. We skip event notification if the cause is Invalid: that
    // cause is used to signal that this function is being called from the
    // revocation path, in order to avoid reporting duplicate events.
7032     if (UVM_ID_IS_GPU(resident_id) &&
7033         va_space->tools.enabled &&
7034         block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) {
7035         uvm_va_block_region_t subregion;
7036         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7037             uvm_tools_record_map_remote(block,
7038                                         NULL,
7039                                         UVM_ID_CPU,
7040                                         resident_id,
7041                                         uvm_va_block_region_start(block, subregion),
7042                                         uvm_va_block_region_size(subregion),
7043                                         block_context->mapping.cause);
7044         }
7045     }
7046 
7047     // Update CPU mapping state
7048     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7049         uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
7050 
7051     uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
7052 
7053     UVM_ASSERT(block_check_mappings(block));
7054 
7055     // Remove all pages that were newly-mapped from the input mask
7056     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7057 
7058     return status;
7059 }
7060 
7061 // Maps the GPU to the given pages which are resident on resident_id.
7062 // map_page_mask is an in/out parameter: the pages which are mapped
7063 // to resident_id are removed from the mask before returning.
7064 //
7065 // Caller must ensure that:
7066 // -  Pages in map_page_mask must not be set in the corresponding pte_bits mask
7067 // for the requested protection on the mapping GPU.
7068 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
7069                                   uvm_va_block_context_t *block_context,
7070                                   uvm_gpu_t *gpu,
7071                                   uvm_processor_id_t resident_id,
7072                                   uvm_page_mask_t *map_page_mask,
7073                                   uvm_prot_t new_prot,
7074                                   uvm_tracker_t *out_tracker)
7075 {
7076     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7077     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7078     uvm_push_t push;
7079     NV_STATUS status;
7080     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
7081     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7082     uvm_pte_bits_gpu_t pte_bit;
7083     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7084     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7085     block_pte_op_t pte_op;
7086 
7087     UVM_ASSERT(map_page_mask);
7088     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
7089 
7090     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7091         UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
7092 
7093     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
7094                                   map_page_mask,
7095                                   &gpu_state->pte_bits[prot_pte_bit]));
7096 
7097     // The pages which will actually change are those in the input page mask
7098     // which are resident on the target.
7099     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
7100         return NV_OK;
7101 
7102     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map));
7103 
7104     // For PTE merge/split computation, compute all resident pages which will
7105     // have exactly new_prot after performing the mapping.
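    // Since the pte_bits masks are cumulative, this is: the pages which already
    // have new_prot or are about to get it, minus any pages mapped at a higher
    // protection, restricted to the pages resident on resident_id.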
7106     uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map);
7107     if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) {
7108         uvm_page_mask_andnot(&block_context->scratch_page_mask,
7109                              &block_context->scratch_page_mask,
7110                              &gpu_state->pte_bits[prot_pte_bit + 1]);
7111     }
7112     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7113 
7114     block_gpu_compute_new_pte_state(va_block,
7115                                     gpu,
7116                                     resident_id,
7117                                     pages_to_map,
7118                                     &block_context->scratch_page_mask,
7119                                     new_pte_state);
7120 
7121     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7122     if (status != NV_OK)
7123         return status;
7124 
7125     status = uvm_push_begin_acquire(gpu->channel_manager,
7126                                     UVM_CHANNEL_TYPE_MEMOPS,
7127                                     &va_block->tracker,
7128                                     &push,
7129                                     "Mapping pages in block [0x%llx, 0x%llx) as %s",
7130                                     va_block->start,
7131                                     va_block->end + 1,
7132                                     uvm_prot_string(new_prot));
7133     if (status != NV_OK)
7134         return status;
7135 
7136     pte_op = BLOCK_PTE_OP_MAP;
7137     if (new_pte_state->pte_is_2m) {
7138         // We're either modifying permissions of a pre-existing 2M PTE, or all
7139         // permissions match so we can merge to a new 2M PTE.
7140         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7141     }
7142     else if (gpu_state->pte_is_2m) {
7143         // Permissions on a subset of the existing 2M PTE are being upgraded, so
7144         // we have to split it into the appropriate mix of big and 4k PTEs.
7145         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7146     }
7147     else {
7148         // We're upgrading permissions on some pre-existing mix of big and 4K
7149         // PTEs into some other mix of big and 4K PTEs.
7150         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7151     }
7152 
7153     // If we are mapping remotely, record the event
7154     if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) {
7155         uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block);
7156 
7157         UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid);
7158 
7159         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7160             uvm_tools_record_map_remote(va_block,
7161                                         &push,
7162                                         gpu->id,
7163                                         resident_id,
7164                                         uvm_va_block_region_start(va_block, subregion),
7165                                         uvm_va_block_region_size(subregion),
7166                                         block_context->mapping.cause);
7167         }
7168     }
7169 
7170     uvm_push_end(&push);
7171 
7172     // Update GPU mapping state
7173     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7174         uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map);
7175 
7176     uvm_processor_mask_set(&va_block->mapped, gpu->id);
7177 
7178     // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages
7179     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7180         uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
7181 
    // Remove the newly-mapped pages (i.e. the pages resident on this processor)
    // from the input mask.
7184     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7185 
7186     UVM_ASSERT(block_check_mappings(va_block));
7187 
7188     return uvm_tracker_add_push_safe(out_tracker, &push);
7189 }
7190 
7191 static void map_get_allowed_destinations(uvm_va_block_t *block,
7192                                          uvm_va_block_context_t *va_block_context,
7193                                          const uvm_va_policy_t *policy,
7194                                          uvm_processor_id_t id,
7195                                          uvm_processor_mask_t *allowed_mask)
7196 {
7197     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7198 
7199     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
7200         // UVM-Lite can only map resident pages on the preferred location
7201         uvm_processor_mask_zero(allowed_mask);
7202         uvm_processor_mask_set(allowed_mask, policy->preferred_location);
7203     }
7204     else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
7205               (uvm_id_equal(policy->preferred_location, id) &&
7206                !is_uvm_fault_force_sysmem_set() &&
7207                !uvm_hmm_must_use_sysmem(block, va_block_context))) &&
7208              uvm_va_space_processor_has_memory(va_space, id)) {
7209         // When operating under read-duplication we should only map the local
7210         // processor to cause fault-and-duplicate of remote pages.
7211         //
7212         // The same holds when this processor is the preferred location: only
7213         // create local mappings to force remote pages to fault-and-migrate.
7214         uvm_processor_mask_zero(allowed_mask);
7215         uvm_processor_mask_set(allowed_mask, id);
7216     }
7217     else {
7218         // Common case: Just map wherever the memory happens to reside
7219         uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]);
7220         return;
7221     }
7222 
7223     // Clamp to resident and accessible processors
7224     uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident);
7225     uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]);
7226 }
7227 
7228 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
7229                            uvm_va_block_context_t *va_block_context,
7230                            uvm_processor_id_t id,
7231                            uvm_va_block_region_t region,
7232                            const uvm_page_mask_t *map_page_mask,
7233                            uvm_prot_t new_prot,
7234                            UvmEventMapRemoteCause cause,
7235                            uvm_tracker_t *out_tracker)
7236 {
7237     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7238     uvm_gpu_t *gpu = NULL;
7239     uvm_processor_mask_t allowed_destinations;
7240     uvm_processor_id_t resident_id;
7241     const uvm_page_mask_t *pte_mask;
7242     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
7243     NV_STATUS status;
7244 
7245     va_block_context->mapping.cause = cause;
7246 
7247     UVM_ASSERT(new_prot != UVM_PROT_NONE);
7248     UVM_ASSERT(new_prot < UVM_PROT_MAX);
7249     uvm_assert_mutex_locked(&va_block->lock);
7250     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
7251 
7252     // Mapping is not supported on the eviction path that doesn't hold the VA
7253     // space lock.
7254     uvm_assert_rwsem_locked(&va_space->lock);
7255 
7256     if (UVM_ID_IS_CPU(id)) {
7257         uvm_pte_bits_cpu_t prot_pte_bit;
7258 
7259         // Check if the current thread is allowed to call vm_insert_page
7260         if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm))
7261             return NV_OK;
7262 
7263         prot_pte_bit = get_cpu_pte_bit_index(new_prot);
7264         pte_mask = &va_block->cpu.pte_bits[prot_pte_bit];
7265     }
7266     else {
7267         uvm_va_block_gpu_state_t *gpu_state;
7268         uvm_pte_bits_gpu_t prot_pte_bit;
7269 
7270         gpu = uvm_va_space_get_gpu(va_space, id);
7271 
7272         // Although this GPU UUID is registered in the VA space, it might not have a
7273         // GPU VA space registered.
7274         if (!uvm_gpu_va_space_get(va_space, gpu))
7275             return NV_OK;
7276 
7277         gpu_state = block_gpu_state_get_alloc(va_block, gpu);
7278         if (!gpu_state)
7279             return NV_ERR_NO_MEMORY;
7280 
7281         prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7282         pte_mask = &gpu_state->pte_bits[prot_pte_bit];
7283     }
7284 
7285     uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask);
7286 
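    // Pages which already have at least the requested protection don't need to
    // be touched, so bail out early if nothing is left to map.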
7287     if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask))
7288         return NV_OK;
7289 
7290     // Map per resident location so we can more easily detect physically-
7291     // contiguous mappings.
7292     map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations);
7293 
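    // for_each_closest_id walks the allowed residencies from nearest to
    // farthest from the mapping processor, so closer copies get mapped first.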
7294     for_each_closest_id(resident_id, &allowed_destinations, id, va_space) {
7295         if (UVM_ID_IS_CPU(id)) {
7296             status = block_map_cpu_to(va_block,
7297                                       va_block_context,
7298                                       resident_id,
7299                                       region,
7300                                       running_page_mask,
7301                                       new_prot,
7302                                       out_tracker);
7303         }
7304         else {
7305             status = block_map_gpu_to(va_block,
7306                                       va_block_context,
7307                                       gpu,
7308                                       resident_id,
7309                                       running_page_mask,
7310                                       new_prot,
7311                                       out_tracker);
7312         }
7313 
7314         if (status != NV_OK)
7315             return status;
7316 
7317         // If we've mapped all requested pages, we're done
7318         if (uvm_page_mask_region_empty(running_page_mask, region))
7319             break;
7320     }
7321 
7322     return NV_OK;
7323 }
7324 
// Revokes CPU write permission for the given pages. This is implemented by
// unmapping the pages and remapping them with the lower permission, because
// vm_insert_page can only be used for upgrades from Invalid.
7328 //
7329 // Caller must ensure that:
7330 // -  Pages in revoke_page_mask must be set in the
7331 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask.
7332 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block,
7333                                         uvm_va_block_context_t *block_context,
7334                                         uvm_va_block_region_t region,
7335                                         const uvm_page_mask_t *revoke_page_mask,
7336                                         uvm_tracker_t *out_tracker)
7337 {
7338     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7339     uvm_va_block_region_t subregion;
7340 
7341     UVM_ASSERT(revoke_page_mask);
7342 
7343     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
7344 
7345     block_unmap_cpu(block, region, revoke_page_mask);
7346 
7347     // Coalesce revocation event notification
7348     for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) {
7349         uvm_perf_event_notify_revocation(&va_space->perf_events,
7350                                          block,
7351                                          UVM_ID_CPU,
7352                                          uvm_va_block_region_start(block, subregion),
7353                                          uvm_va_block_region_size(subregion),
7354                                          UVM_PROT_READ_WRITE_ATOMIC,
7355                                          UVM_PROT_READ_ONLY);
7356     }
7357 
7358     // uvm_va_block_map will skip this remap if we aren't holding the right mm
7359     // lock.
7360     return uvm_va_block_map(block,
7361                             block_context,
7362                             UVM_ID_CPU,
7363                             region,
7364                             revoke_page_mask,
7365                             UVM_PROT_READ_ONLY,
7366                             UvmEventMapRemoteCauseInvalid,
7367                             out_tracker);
7368 }
7369 
7370 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
7371                                               uvm_va_block_context_t *block_context,
7372                                               uvm_gpu_t *gpu,
7373                                               uvm_prot_t prot_revoked,
7374                                               const uvm_page_mask_t *pages_revoked)
7375 {
7376     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7377     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7378     uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
7379     uvm_pte_bits_gpu_t pte_bit;
7380 
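    // Walk the revoked protection levels from highest to lowest and report one
    // revocation event per level, using the old protection implied by each
    // pte_bits entry and the new maximum protection of prot_revoked - 1.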
7381     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
7382         uvm_prot_t old_prot;
7383 
7384         if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
7385             continue;
7386 
7387         if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
7388             old_prot = UVM_PROT_READ_WRITE_ATOMIC;
7389         else
7390             old_prot = UVM_PROT_READ_WRITE;
7391 
7392         for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
7393             uvm_perf_event_notify_revocation(&va_space->perf_events,
7394                                              block,
7395                                              gpu->id,
7396                                              uvm_va_block_region_start(block, subregion),
7397                                              uvm_va_block_region_size(subregion),
7398                                              old_prot,
7399                                              prot_revoked - 1);
7400         }
7401     }
7402 }
7403 
7404 // Revokes the given pages mapped by gpu which are resident on resident_id.
7405 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
7406 // permissions and are mapped to resident_id are removed from the mask before
7407 // returning.
7408 //
// Caller must ensure that:
// -  Pages in revoke_page_mask must be set in the corresponding pte_bits mask
// for the protection to be revoked on the mapping GPU.
7412 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
7413                                           uvm_va_block_context_t *block_context,
7414                                           uvm_gpu_t *gpu,
7415                                           uvm_processor_id_t resident_id,
7416                                           uvm_page_mask_t *revoke_page_mask,
7417                                           uvm_prot_t prot_to_revoke,
7418                                           uvm_tracker_t *out_tracker)
7419 {
7420     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7421     uvm_push_t push;
7422     NV_STATUS status;
7423     uvm_pte_bits_gpu_t pte_bit;
7424     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7425     uvm_prot_t new_prot = prot_to_revoke - 1;
7426     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7427     block_pte_op_t pte_op;
7428     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7429     uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
7430 
7431     UVM_ASSERT(revoke_page_mask);
7432     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
7433 
7434     // The pages which will actually change are those in the input page mask
7435     // which are resident on the target.
7436     if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
7437         return NV_OK;
7438 
7439     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke));
7440 
7441     // For PTE merge/split computation, compute all resident pages which will
7442     // have exactly prot_to_revoke-1 after performing the revocation.
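    // Since the pte_bits masks are cumulative: the pages keeping prot_to_revoke
    // are those mapped at that level minus the ones being revoked now. Every
    // other page mapped at prot_to_revoke-1 or higher and resident on
    // resident_id will end up at exactly prot_to_revoke-1.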
7443     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7444     uvm_page_mask_andnot(&block_context->scratch_page_mask,
7445                          &gpu_state->pte_bits[prot_pte_bit - 1],
7446                          &block_context->scratch_page_mask);
7447     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7448 
7449     block_gpu_compute_new_pte_state(va_block,
7450                                     gpu,
7451                                     resident_id,
7452                                     pages_to_revoke,
7453                                     &block_context->scratch_page_mask,
7454                                     new_pte_state);
7455 
7456     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7457     if (status != NV_OK)
7458         return status;
7459 
7460     status = uvm_push_begin_acquire(gpu->channel_manager,
7461                                     UVM_CHANNEL_TYPE_MEMOPS,
7462                                     &va_block->tracker,
7463                                     &push,
7464                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7465                                     uvm_prot_string(prot_to_revoke),
7466                                     va_block->start,
7467                                     va_block->end + 1);
7468     if (status != NV_OK)
7469         return status;
7470 
7471     pte_op = BLOCK_PTE_OP_REVOKE;
7472     if (new_pte_state->pte_is_2m) {
7473         // We're either modifying permissions of a pre-existing 2M PTE, or all
7474         // permissions match so we can merge to a new 2M PTE.
7475         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7476     }
7477     else if (gpu_state->pte_is_2m) {
7478         // Permissions on a subset of the existing 2M PTE are being downgraded,
7479         // so we have to split it into the appropriate mix of big and 4k PTEs.
7480         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7481     }
7482     else {
7483         // We're downgrading permissions on some pre-existing mix of big and 4K
7484         // PTEs into some other mix of big and 4K PTEs.
7485         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7486     }
7487 
7488     uvm_push_end(&push);
7489 
7490     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7491 
7492     // Update GPU mapping state
7493     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7494         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7495 
    // Remove all pages resident on this processor from the input mask: both the
    // pages which were just revoked and any pages which already had the correct
    // permissions.
7499     uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke);
7500 
7501     UVM_ASSERT(block_check_mappings(va_block));
7502 
7503     return uvm_tracker_add_push_safe(out_tracker, &push);
7504 }
7505 
7506 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
7507                                    uvm_va_block_context_t *va_block_context,
7508                                    uvm_processor_id_t id,
7509                                    uvm_va_block_region_t region,
7510                                    const uvm_page_mask_t *revoke_page_mask,
7511                                    uvm_prot_t prot_to_revoke,
7512                                    uvm_tracker_t *out_tracker)
7513 {
7514     uvm_gpu_t *gpu;
7515     uvm_va_block_gpu_state_t *gpu_state;
7516     uvm_processor_mask_t resident_procs;
7517     uvm_processor_id_t resident_id;
7518     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask;
7519     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7520     uvm_pte_bits_gpu_t prot_pte_bit;
7521 
7522     UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY);
7523     UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX);
7524     uvm_assert_mutex_locked(&va_block->lock);
7525 
7526     if (UVM_ID_IS_CPU(id)) {
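        // CPU PTEs only track read and write permissions, so revoking atomic
        // access from the CPU is a no-op.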
7527         if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC)
7528             return NV_OK;
7529 
7530         if (uvm_va_block_is_hmm(va_block)) {
7531             // Linux is responsible for CPU page table updates.
7532             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region);
7533             return NV_OK;
7534         }
7535 
7536         uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7537 
7538         if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]))
7539             return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker);
7540 
7541         return NV_OK;
7542     }
7543 
7544     gpu = uvm_va_space_get_gpu(va_space, id);
7545 
7546     // UVM-Lite GPUs should never have access revoked
7547     UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id),
7548                    "GPU %s\n", uvm_gpu_name(gpu));
7549 
7550     // Return early if there are no mappings for the GPU present in the block
7551     if (!uvm_processor_mask_test(&va_block->mapped, gpu->id))
7552         return NV_OK;
7553 
7554     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7555     prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7556 
7557     uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7558 
7559     if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit]))
7560         return NV_OK;
7561 
7562     // Revoke per resident location so we can more easily detect physically-
7563     // contiguous mappings.
7564     uvm_processor_mask_copy(&resident_procs, &va_block->resident);
7565 
7566     for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) {
7567         NV_STATUS status = block_revoke_prot_gpu_to(va_block,
7568                                                     va_block_context,
7569                                                     gpu,
7570                                                     resident_id,
7571                                                     running_page_mask,
7572                                                     prot_to_revoke,
7573                                                     out_tracker);
7574         if (status != NV_OK)
7575             return status;
7576 
7577         // If we've revoked all requested pages, we're done
7578         if (uvm_page_mask_region_empty(running_page_mask, region))
7579             break;
7580     }
7581 
7582     return NV_OK;
7583 }
7584 
7585 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
7586                                 uvm_va_block_context_t *va_block_context,
7587                                 const uvm_processor_mask_t *map_processor_mask,
7588                                 uvm_va_block_region_t region,
7589                                 const uvm_page_mask_t *map_page_mask,
7590                                 uvm_prot_t new_prot,
7591                                 UvmEventMapRemoteCause cause)
7592 {
7593     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7594     NV_STATUS status = NV_OK;
7595     NV_STATUS tracker_status;
7596     uvm_processor_id_t id;
7597 
7598     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
7599 
7600     for_each_id_in_mask(id, map_processor_mask) {
7601         status = uvm_va_block_map(va_block,
7602                                   va_block_context,
7603                                   id,
7604                                   region,
7605                                   map_page_mask,
7606                                   new_prot,
7607                                   cause,
7608                                   &local_tracker);
7609         if (status != NV_OK)
7610             break;
7611     }
7612 
7613     // Regardless of error, add the successfully-pushed mapping operations into
7614     // the block's tracker. Note that we can't overwrite the tracker because we
7615     // aren't guaranteed that the map actually pushed anything (in which case it
7616     // would've acquired the block tracker first).
7617     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7618     uvm_tracker_deinit(&local_tracker);
7619 
7620     return status == NV_OK ? tracker_status : status;
7621 }
7622 
7623 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
7624                                   uvm_va_block_context_t *va_block_context,
7625                                   const uvm_processor_mask_t *unmap_processor_mask,
7626                                   uvm_va_block_region_t region,
7627                                   const uvm_page_mask_t *unmap_page_mask)
7628 {
7629     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7630     NV_STATUS status = NV_OK;
7631     NV_STATUS tracker_status;
7632     uvm_processor_id_t id;
7633 
    // Watch out: unmap_processor_mask could change during iteration since it
    // could be va_block->mapped.
7636     for_each_id_in_mask(id, unmap_processor_mask) {
7637         // Errors could either be a system-fatal error (ECC) or an allocation
7638         // retry due to PTE splitting. In either case we should stop after
7639         // hitting the first one.
7640         status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker);
7641         if (status != NV_OK)
7642             break;
7643     }
7644 
7645     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7646     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7647     uvm_tracker_deinit(&local_tracker);
7648 
7649     return status == NV_OK ? tracker_status : status;
7650 }
7651 
7652 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
7653                                         uvm_va_block_context_t *va_block_context,
7654                                         const uvm_processor_mask_t *revoke_processor_mask,
7655                                         uvm_va_block_region_t region,
7656                                         const uvm_page_mask_t *revoke_page_mask,
7657                                         uvm_prot_t prot_to_revoke)
7658 {
7659     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7660     NV_STATUS status = NV_OK;
7661     NV_STATUS tracker_status;
7662     uvm_processor_id_t id;
7663 
7664     for_each_id_in_mask(id, revoke_processor_mask) {
7665         status = uvm_va_block_revoke_prot(va_block,
7666                                           va_block_context,
7667                                           id,
7668                                           region,
7669                                           revoke_page_mask,
7670                                           prot_to_revoke,
7671                                           &local_tracker);
7672         if (status != NV_OK)
7673             break;
7674     }
7675 
7676     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7677     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7678     uvm_tracker_deinit(&local_tracker);
7679 
7680     return status == NV_OK ? tracker_status : status;
7681 }
7682 
7683 // Updates the read_duplicated_pages mask in the block when the state of GPU id
7684 // is being destroyed
7685 static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
7686                                               uvm_gpu_id_t id,
7687                                               uvm_va_block_gpu_state_t *gpu_state)
7688 {
7689     uvm_gpu_id_t running_id;
7690     bool first = true;
7691     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7692     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7693     uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask;
7694     uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask;
7695 
7696     uvm_page_mask_zero(&block->read_duplicated_pages);
7697 
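    // A page is read-duplicated if it is resident on at least two of the
    // remaining processors. running_page_mask accumulates the union of the
    // residency masks visited so far, so any overlap with the next processor's
    // residency marks those pages as read-duplicated.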
7698     for_each_id_in_mask(running_id, &block->resident) {
7699         const uvm_page_mask_t *running_residency_mask;
7700 
7701         if (uvm_id_equal(running_id, id))
7702             continue;
7703 
7704         running_residency_mask = uvm_va_block_resident_mask_get(block, running_id);
7705 
7706         if (first) {
7707             uvm_page_mask_copy(running_page_mask, running_residency_mask);
7708             first = false;
7709             continue;
7710         }
7711 
7712         if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask))
7713             uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask);
7714 
7715         uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask);
7716     }
7717 }
7718 
7719 // Unmaps all GPU mappings under this block, frees the page tables, and frees
7720 // all the GPU chunks. This simply drops the chunks on the floor, so the caller
7721 // must take care of copying the data elsewhere if it needs to remain intact.
7722 //
7723 // This serializes on the block tracker since it must unmap page tables.
7724 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id)
7725 {
7726     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
7727     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7728     uvm_gpu_va_space_t *gpu_va_space;
7729     uvm_gpu_t *gpu, *other_gpu;
7730 
7731     if (!gpu_state)
7732         return;
7733 
7734     uvm_assert_mutex_locked(&block->lock);
7735 
7736     // Unmap PTEs and free page tables
7737     gpu = uvm_va_space_get_gpu(va_space, id);
7738     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
7739     if (gpu_va_space) {
7740         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7741 
7742         uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
7743     }
7744 
7745     UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
7746 
7747     // No processor should have this GPU mapped at this point
7748     UVM_ASSERT(block_check_processor_not_mapped(block, id));
7749 
7750     // We need to remove the mappings of the indirect peers from the reverse
7751     // map when the GPU state is being destroyed (for example, on
7752     // unregister_gpu) and when peer access between indirect peers is disabled.
7753     // However, we need to avoid double mapping removals. There are two
7754     // possible scenarios:
7755     // - Disable peer access first. This will remove all mappings between A and
7756     // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
7757     // unregister_gpu will not operate on that pair of GPUs.
7758     // - Unregister GPU first. This will remove all mappings from all indirect
7759     // peers to the GPU being unregistered. It will also destroy its GPU state.
7760     // Subsequent calls to disable peers will remove the mappings from the GPU
7761     // being unregistered, but never to the GPU being unregistered (since it no
7762     // longer has a valid GPU state).
7763     for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
7764         block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
7765 
7766     if (gpu_state->chunks) {
7767         size_t i, num_chunks;
7768 
7769         update_read_duplicated_pages_mask(block, id, gpu_state);
7770         uvm_page_mask_zero(&gpu_state->resident);
7771         block_clear_resident_processor(block, id);
7772 
7773         num_chunks = block_num_gpu_chunks(block, gpu);
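        // Unmap each chunk and hand it back to PMM. Both operations add their
        // work to the block tracker, which is waited on below before tearing
        // down the DMA mappings of the CPU pages.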
7774         for (i = 0; i < num_chunks; i++) {
7775             uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
7776             if (!chunk)
7777                 continue;
7778 
7779             uvm_mmu_chunk_unmap(chunk, &block->tracker);
7780             uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
7781         }
7782 
7783         uvm_kvfree(gpu_state->chunks);
7784     }
7785     else {
7786         UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
7787     }
7788 
7790     // Pending operations may still need the DMA memory to be mapped.
7791     uvm_tracker_wait(&block->tracker);
7792 
7793     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
7794     uvm_processor_mask_clear(&block->evicted_gpus, id);
7795 
7796     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
7797     block->gpus[uvm_id_gpu_index(id)] = NULL;
7798 }
7799 
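// Returns the given page table range to the tree if it was ever allocated, and
// clears it so repeated calls are harmless no-ops.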
7800 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
7801 {
7802     if (range->table) {
7803         uvm_page_tree_put_ptes(tree, range);
7804         memset(range, 0, sizeof(*range));
7805     }
7806 }
7807 
7808 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
7809 {
7810     uvm_assert_mutex_locked(&va_block->lock);
7811 
7812     if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
7813         return NV_OK;
7814 
7815     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7816     // comments in pre_populate_pde1_gpu.
7817     return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
7818 }
7819 
7820 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
7821                                       uvm_gpu_va_space_t *gpu_va_space,
7822                                       uvm_va_block_context_t *block_context)
7823 {
7824     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7825     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7826     uvm_gpu_t *gpu = gpu_va_space->gpu;
7827     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7828     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7829     uvm_push_t push;
7830     NV_STATUS status;
7831 
7832     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7833 
7834     if (!gpu_state)
7835         return;
7836 
7837     uvm_assert_mutex_locked(&va_block->lock);
7838 
7839     // Unmapping the whole block won't cause a page table split, so this should
7840     // only fail if we have a system-fatal error.
7841     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
7842     if (status != NV_OK) {
7843         UVM_ASSERT(status == uvm_global_get_status());
7844         return; // Just leak
7845     }
7846 
7847     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
7848 
7849     // Reset the page tables if other allocations could reuse them
7850     if (!block_gpu_supports_2m(va_block, gpu) &&
7851         !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7852 
7853         status = uvm_push_begin_acquire(gpu->channel_manager,
7854                                         UVM_CHANNEL_TYPE_MEMOPS,
7855                                         &local_tracker,
7856                                         &push,
7857                                         "Resetting PTEs for block [0x%llx, 0x%llx)",
7858                                         va_block->start,
7859                                         va_block->end + 1);
7860         if (status != NV_OK) {
7861             UVM_ASSERT(status == uvm_global_get_status());
7862             return; // Just leak
7863         }
7864 
7865         uvm_pte_batch_begin(&push, pte_batch);
7866         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7867 
        // While the big PTEs are active, the 4k PTEs under them are garbage.
        // Make those 4k PTEs invalid so the page tree code can reuse them for
        // other allocations on this VA. They don't need TLB invalidates since
        // the big PTEs above them are active.
7872         if (gpu_state->page_table_range_4k.table) {
7873             uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
7874             block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
7875         }
7876 
7877         // We unmapped all big PTEs above, which means they have the unmapped
7878         // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
7879         // invalid to activate the 4ks below so new allocations using just those
7880         // 4k PTEs will work.
7881         block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch);
7882 
7883         uvm_pte_batch_end(pte_batch);
7884         uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE);
7885 
7886         uvm_push_end(&push);
7887         uvm_tracker_overwrite_with_push(&local_tracker, &push);
7888     }
7889 
7890     // The unmap must finish before we free the page tables
7891     status = uvm_tracker_wait_deinit(&local_tracker);
7892     if (status != NV_OK)
7893         return; // System-fatal error, just leak
7894 
7895     // Note that if the PTE is currently 2M with lower tables allocated but not
7896     // in use, calling put_ptes on those lower ranges will re-write the 2M entry
7897     // to be a PDE.
7898     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k);
7899     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big);
7900     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m);
7901 
7902     gpu_state->pte_is_2m = false;
7903     gpu_state->initialized_big = false;
7904     gpu_state->activated_big = false;
7905     gpu_state->activated_4k = false;
7906     bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7907 
7908     UVM_ASSERT(block_check_mappings(va_block));
7909 }
7910 
7911 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7912 {
7913     NV_STATUS status;
7914     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7915 
7916     UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
7917     uvm_assert_rwsem_locked_write(&va_space->lock);
7918     uvm_assert_mutex_locked(&va_block->lock);
7919 
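    // Indirect peers need explicit mappings to each other's chunks. Map in both
    // directions; if the second direction fails, roll back the first so the two
    // GPUs are left in a symmetric state.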
7920     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7921         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7922         if (status != NV_OK)
7923             return status;
7924 
7925         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7926         if (status != NV_OK) {
7927             block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7928             return status;
7929         }
7930     }
7931 
7932     // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
7933     //       call it here.
7934 
7935     return NV_OK;
7936 }
7937 
7938 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7939 {
7940     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7941     NV_STATUS status;
7942     uvm_tracker_t tracker = UVM_TRACKER_INIT();
7943     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7944     uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask;
7945     const uvm_page_mask_t *resident0;
7946     const uvm_page_mask_t *resident1;
7947 
7948     uvm_assert_mutex_locked(&va_block->lock);
7949 
7950     // See comment in block_destroy_gpu_state
7951     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7952         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7953         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7954     }
7955 
7956     // If either of the GPUs doesn't have GPU state then nothing could be mapped
7957     // between them.
7958     if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
7959         return;
7960 
7961     resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
7962     resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
7963 
7964     // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
7965     if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
7966         status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
7967         if (status != NV_OK) {
7968             // Since all PTEs unmapped by this call have the same aperture, page
7969             // splits should never be required so any failure should be the
7970             // result of a system-fatal error.
7971             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7972                            "Unmapping failed: %s, GPU %s\n",
7973                            nvstatusToString(status),
7974                            uvm_gpu_name(gpu0));
7975         }
7976     }
7977 
7978     // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
7979     if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
7980         status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
7981         if (status != NV_OK) {
7982             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7983                            "Unmapping failed: %s, GPU %s\n",
7984                            nvstatusToString(status),
                           uvm_gpu_name(gpu1));
7986         }
7987     }
7988 
7989     status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
7990     if (status != NV_OK)
7991         UVM_ASSERT(status == uvm_global_get_status());
7992 
7993     status = uvm_tracker_wait_deinit(&tracker);
7994     if (status != NV_OK)
7995         UVM_ASSERT(status == uvm_global_get_status());
7996 }
7997 
7998 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
7999 {
8000     NV_STATUS status;
8001     uvm_va_range_t *va_range = va_block->va_range;
8002     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8003     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8004     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
8005 
8006     uvm_assert_mutex_locked(&va_block->lock);
8007     UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
8008 
8009     // If the GPU doesn't have GPU state then nothing could be mapped.
8010     if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
8011         return;
8012 
8013     // In UVM-Lite mode, mappings to the preferred location are not tracked
8014     // directly, so just unmap the whole block.
8015     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
8016     if (status != NV_OK) {
8017         // Unmapping the whole block should not cause page splits so any failure
8018         // should be the result of a system-fatal error.
8019         UVM_ASSERT_MSG(status == uvm_global_get_status(),
8020                        "Unmapping failed: %s, GPU %s\n",
8021                        nvstatusToString(status), uvm_gpu_name(gpu));
8022     }
8023 
8024     status = uvm_tracker_wait(&va_block->tracker);
8025     if (status != NV_OK) {
8026         UVM_ASSERT_MSG(status == uvm_global_get_status(),
8027                        "Unmapping failed: %s, GPU %s\n",
8028                        nvstatusToString(status), uvm_gpu_name(gpu));
8029     }
8030 }
8031 
8032 // Evict pages from the GPU by moving each resident region to the CPU
8033 //
8034 // Notably the caller needs to support allocation-retry as
8035 // uvm_va_block_migrate_locked() requires that.
8036 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8037 {
8038     NV_STATUS status = NV_OK;
8039     const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id);
8040     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
8041     uvm_va_block_region_t subregion;
8042     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8043     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm);
8044 
8045     // Move all subregions resident on the GPU to the CPU
8046     for_each_va_block_subregion_in_mask(subregion, resident, region) {
8047         if (uvm_va_block_is_hmm(va_block)) {
8048             status = uvm_hmm_va_block_evict_pages_from_gpu(va_block,
8049                                                            gpu,
8050                                                            block_context,
8051                                                            resident,
8052                                                            subregion);
8053         }
8054         else {
8055             status = uvm_va_block_migrate_locked(va_block,
8056                                                  NULL,
8057                                                  block_context,
8058                                                  subregion,
8059                                                  UVM_ID_CPU,
8060                                                  UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
8061                                                  NULL);
8062         }
8063         if (status != NV_OK)
8064             return status;
8065     }
8066 
8067     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id));
8068     return NV_OK;
8069 }
8070 
8071 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8072 {
8073     NV_STATUS status;
8074     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8075 
8076     uvm_assert_mutex_locked(&va_block->lock);
8077 
8078     if (!gpu_state)
8079         return;
8080 
8081     // The mappings should've already been torn down by GPU VA space unregister
8082     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
8083     UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
8084     UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu));
8085 
8086     // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and
8087     // we don't rely on any state of the block across the call.
8088     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));
8089     if (status != NV_OK) {
8090         UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n",
8091                       nvstatusToString(status),
8092                       uvm_gpu_name(gpu));
8093         uvm_global_set_fatal_error(status);
8094     }
8095 
8096     // This function will copy the block's tracker into each chunk then free the
8097     // chunk to PMM. If we do this before waiting for the block tracker below
8098     // we'll populate PMM's free chunks with tracker entries, which gives us
8099     // better testing coverage of chunk synchronization on GPU unregister.
8100     block_destroy_gpu_state(va_block, gpu->id);
8101 
8102     // Any time a GPU is unregistered we need to make sure that there are no
8103     // pending (direct or indirect) tracker entries for that GPU left in the
8104     // block's tracker. The only way to ensure that is to wait for the whole
8105     // tracker.
8106     status = uvm_tracker_wait(&va_block->tracker);
8107     if (status != NV_OK)
8108         UVM_ASSERT(status == uvm_global_get_status());
8109 }
8110 
8111 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8112 {
8113     // Take the lock internally to not expose the caller to allocation-retry.
8114     uvm_mutex_lock(&va_block->lock);
8115 
8116     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
8117 
8118     uvm_mutex_unlock(&va_block->lock);
8119 }
8120 
8121 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
8122 {
8123     uvm_page_index_t page_index;
8124 
8125     uvm_assert_mutex_locked(&va_block->lock);
8126 
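    // Only pages actually resident on the CPU need to be marked dirty, so walk
    // the CPU-resident pages within the region.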
8127     for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region)
8128         block_mark_cpu_page_dirty(va_block, page_index);
8129 }
8130 
8131 // Tears down everything within the block, but doesn't free the block itself.
// Note that when uvm_va_block_kill is called, this is called twice: once for
// the initial kill itself, then again when the block's ref count eventually
// drops to zero and the block is destroyed. block->va_range is used to track
// whether the block has already been killed.
8136 static void block_kill(uvm_va_block_t *block)
8137 {
8138     uvm_va_space_t *va_space;
8139     uvm_perf_event_data_t event_data;
8140     uvm_cpu_chunk_t *chunk;
8141     uvm_gpu_id_t id;
8142     NV_STATUS status;
8143     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
8144     uvm_page_index_t page_index;
8145     uvm_page_index_t next_page_index;
8146 
8147     if (uvm_va_block_is_dead(block))
8148         return;
8149 
8150     va_space = uvm_va_block_get_va_space(block);
8151     event_data.block_destroy.block = block;
8152     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
8153 
8154     // Unmap all processors in parallel first. Unmapping the whole block won't
8155     // cause a page table split, so this should only fail if we have a system-
8156     // fatal error.
8157     if (!uvm_processor_mask_empty(&block->mapped)) {
8158         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8159 
8160         // HMM CPU mappings are controlled by Linux so no need to unmap.
8161         // Remote GPU mappings will be removed below.
8162         if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
8163             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
8164             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
8165             uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
8166         }
8167 
8168         // We could only be killed with mapped GPU state by VA range free or VA
8169         // space teardown, so it's safe to use the va_space's block_context
8170         // because both of those have the VA space lock held in write mode.
8171         status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
8172         UVM_ASSERT(status == uvm_global_get_status());
8173     }
8174 
8175     UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
8176 
8177     // Free the GPU page tables and chunks
8178     for_each_gpu_id(id)
8179         block_destroy_gpu_state(block, id);
8180 
8181     // Wait for the GPU PTE unmaps before freeing CPU memory
8182     uvm_tracker_wait_deinit(&block->tracker);
8183 
8184     // No processor should have the CPU mapped at this point
8185     UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU));
8186 
8187     // Free CPU pages
8188     for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) {
        // Tell the OS we wrote to the page because we sometimes clear the
        // dirty bit after writing to it, so be conservative. HMM dirty flags
        // are managed by the kernel.
8192         if (!uvm_va_block_is_hmm(block))
8193             uvm_cpu_chunk_mark_dirty(chunk, 0);
8194         uvm_cpu_chunk_remove_from_block(block, page_index);
8195         uvm_cpu_chunk_free(chunk);
8196     }
8197 
8198     uvm_kvfree((void *)block->cpu.chunks);
8199     block->cpu.chunks = 0;
8200 
8201     // Clearing the resident bit isn't strictly necessary since this block
8202     // is getting destroyed, but it keeps state consistent for assertions.
8203     uvm_page_mask_zero(&block->cpu.resident);
8204     block_clear_resident_processor(block, UVM_ID_CPU);
8205 
8206     if (uvm_va_block_is_hmm(block))
8207         uvm_va_policy_clear(block, block->start, block->end);
8208 
8209     block->va_range = NULL;
8210 #if UVM_IS_CONFIG_HMM()
8211     block->hmm.va_space = NULL;
8212 #endif
8213 }
8214 
8215 // Called when the block's ref count drops to 0
8216 void uvm_va_block_destroy(nv_kref_t *nv_kref)
8217 {
8218     uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref);
8219 
8220     // Nobody else should have a reference when freeing
8221     uvm_assert_mutex_unlocked(&block->lock);
8222 
8223     uvm_mutex_lock(&block->lock);
8224     block_kill(block);
8225     uvm_mutex_unlock(&block->lock);
8226 
8227     if (uvm_enable_builtin_tests) {
8228         uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
8229 
8230         kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
8231     }
8232     else {
8233         kmem_cache_free(g_uvm_va_block_cache, block);
8234     }
8235 }
8236 
8237 void uvm_va_block_kill(uvm_va_block_t *va_block)
8238 {
8239     uvm_mutex_lock(&va_block->lock);
8240     block_kill(va_block);
8241     uvm_mutex_unlock(&va_block->lock);
8242 
8243     // May call block_kill again
8244     uvm_va_block_release(va_block);
8245 }
8246 
8247 static void block_gpu_release_region(uvm_va_block_t *va_block,
8248                                      uvm_gpu_id_t gpu_id,
8249                                      uvm_va_block_gpu_state_t *gpu_state,
8250                                      uvm_page_mask_t *page_mask,
8251                                      uvm_va_block_region_t region)
8252 {
8253     uvm_page_index_t page_index;
8254 
8255     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
8256         uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index];
8257 
8258         if (!gpu_chunk)
8259             continue;
8260 
8261         // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
8262 
8263         uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
8264 
8265         // The GPU chunk will be freed when the device private reference drops.
8266         if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) &&
8267             uvm_page_mask_empty(&gpu_state->resident))
8268             block_clear_resident_processor(va_block, gpu_id);
8269 
8270         gpu_state->chunks[page_index] = NULL;
8271     }
8272 }
8273 
8274 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
8275                                 uvm_va_block_region_t region)
8276 {
8277     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8278     uvm_perf_event_data_t event_data;
8279     uvm_gpu_id_t gpu_id;
8280 
8281     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
8282     uvm_assert_mutex_locked(&va_block->lock);
8283 
8284     // Reset thrashing state for the region.
8285     event_data.block_munmap.block = va_block;
8286     event_data.block_munmap.region = region;
8287     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
8288 
8289     // Set a flag so that GPU fault events are flushed since they might refer
8290     // to the region being unmapped.
8291     // Note that holding the va_block lock prevents GPU VA spaces from
8292     // being removed so the registered_gpu_va_spaces mask is stable.
8293     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
8294         uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
8295     }
8296 
8297     // Release any remaining vidmem chunks in the given region.
8298     for_each_gpu_id(gpu_id) {
8299         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
8300 
8301         if (!gpu_state)
8302             continue;
8303 
8304         uvm_page_mask_region_clear(&gpu_state->evicted, region);
8305         if (uvm_page_mask_empty(&gpu_state->evicted))
8306             uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
8307 
8308         if (gpu_state->chunks) {
8309             block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
8310 
8311             // TODO: bug 3660922: Need to update the read duplicated pages mask
8312             // when read duplication is supported for HMM.
8313         }
8314         else {
8315             UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id));
8316         }
8317     }
8318 
8319     uvm_va_policy_clear(va_block,
8320                         uvm_va_block_region_start(va_block, region),
8321                         uvm_va_block_region_end(va_block, region));
8322 }
8323 
8324 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8325 {
8326     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8327     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8328     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8329     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
8330     NvU32 alloc_sizes;
8331     DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8332     uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8333     size_t big_page_index;
8334     uvm_push_t push;
8335     NV_STATUS status;
8336 
8337     // We only have to split to big PTEs if we're currently a 2M PTE
8338     if (existing_gpu_state->pte_is_2m) {
8339         // We can skip the split if the 2M PTE is invalid and we have no lower
8340         // PTEs.
8341         if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE &&
8342             !existing_gpu_state->page_table_range_big.table &&
8343             !existing_gpu_state->page_table_range_4k.table)
8344             return NV_OK;
8345 
8346         alloc_sizes = big_page_size;
8347         bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8348 
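        // If the split point falls inside a big page region, that region will
        // only be partially covered by each block after the split, so it can't
        // remain a big PTE on either side. Allocate 4k PTEs for it and clear
        // its bit from new_big_ptes below.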
8349         if (!IS_ALIGNED(new->start, big_page_size)) {
8350             alloc_sizes |= UVM_PAGE_SIZE_4K;
8351 
8352             big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8353             __clear_bit(big_page_index, new_big_ptes);
8354         }
8355 
8356         status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL);
8357         if (status != NV_OK)
8358             return status;
8359 
8360         status = uvm_push_begin_acquire(gpu->channel_manager,
8361                                         UVM_CHANNEL_TYPE_MEMOPS,
8362                                         &existing->tracker,
8363                                         &push,
8364                                         "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8365                                         existing->start, existing->end + 1,
8366                                         new->start, new->end + 1);
8367         if (status != NV_OK)
8368             return status;
8369 
8370         block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push);
8371     }
8372     else {
8373         big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8374 
8375         // If the split point is on a big page boundary, or if the split point
8376         // is not currently covered by a big PTE, we don't have to split
8377         // anything.
8378         if (IS_ALIGNED(new->start, big_page_size) ||
8379             big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK ||
8380             !test_bit(big_page_index, existing_gpu_state->big_ptes))
8381             return NV_OK;
8382 
8383         status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL);
8384         if (status != NV_OK)
8385             return status;
8386 
8387         bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8388         __set_bit(big_page_index, new_big_ptes);
8389 
8390         status = uvm_push_begin_acquire(gpu->channel_manager,
8391                                         UVM_CHANNEL_TYPE_MEMOPS,
8392                                         &existing->tracker,
8393                                         &push,
8394                                         "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8395                                         existing->start, existing->end + 1,
8396                                         new->start, new->end + 1);
8397         if (status != NV_OK)
8398             return status;
8399 
8400         block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push);
8401     }
8402 
8403     uvm_push_end(&push);
8404 
    // Adding this push to the existing block's tracker will cause all GPU PTE
    // splits to serialize on each other, but it's simpler than maintaining a
    // separate tracker and this path isn't performance-critical.
8408     return uvm_tracker_add_push_safe(&existing->tracker, &push);
8409 }
8410 
8411 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new)
8412 {
8413     uvm_gpu_t *gpu;
8414     uvm_gpu_id_t id;
8415     NV_STATUS status;
8416 
8417     for_each_gpu_id(id) {
8418         if (!uvm_va_block_gpu_state_get(existing, id))
8419             continue;
8420 
8421         gpu = block_get_gpu(existing, id);
8422 
8423         if (block_gpu_has_page_tables(existing, gpu)) {
8424             status = block_split_presplit_ptes_gpu(existing, new, gpu);
8425             if (status != NV_OK)
8426                 return status;
8427         }
8428     }
8429 
8430     return NV_OK;
8431 }
8432 
8433 typedef struct
8434 {
8435     // Number of chunks contained by this VA block
8436     size_t num_chunks;
8437 
8438     // Index of the "interesting" chunk, either adjacent to or spanning the
8439     // split point depending on which block this is.
8440     size_t chunk_index;
8441 
8442     // Size of the chunk referenced by chunk_index
8443     uvm_chunk_size_t chunk_size;
8444 } block_gpu_chunk_split_state_t;
8445 
8446 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
8447                                             block_gpu_chunk_split_state_t *state,
8448                                             NvU64 start,
8449                                             NvU64 end,
8450                                             uvm_page_index_t page_index,
8451                                             uvm_gpu_t *gpu)
8452 {
8453     NvU64 size = end - start + 1;
8454     state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu);
8455     state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
8456 }
8457 
8458 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
8459 {
8460     uvm_gpu_t *accessing_gpu;
8461     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8462 
8463     uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
8464 
8465     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8466         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
8467 
8468         uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8469                                                          peer_addr,
8470                                                          uvm_gpu_chunk_get_size(chunk));
8471     }
8472 }
8473 
8474 // Perform any chunk splitting and array growing required for this block split,
8475 // but don't actually move chunk pointers anywhere.
8476 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8477 {
8478     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8479     uvm_gpu_t *accessing_gpu;
8480     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8481     uvm_gpu_chunk_t **temp_chunks;
8482     uvm_gpu_chunk_t *original_chunk, *curr_chunk;
8483     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8484     uvm_chunk_sizes_mask_t split_sizes;
8485     uvm_chunk_size_t subchunk_size;
8486     NV_STATUS status;
8487     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
8488 
8489     block_gpu_chunk_get_split_state(existing,
8490                                     &existing_before_state,
8491                                     existing->start,
8492                                     existing->end,
8493                                     split_page_index,
8494                                     gpu);
8495     block_gpu_chunk_get_split_state(existing,
8496                                     &existing_after_state,
8497                                     existing->start,
8498                                     new->start - 1,
8499                                     split_page_index - 1,
8500                                     gpu);
8501     block_gpu_chunk_get_split_state(new,
8502                                     &new_state,
8503                                     new->start,
8504                                     new->end,
8505                                     0,
8506                                     gpu);
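    // existing_before_state describes existing's chunks over its full range
    // prior to the split, existing_after_state describes existing truncated to
    // end just before new->start, and new_state describes the new block.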
8507 
8508     // Even though we're splitting existing, we could wind up requiring a larger
8509     // chunks array if we split a large chunk into many smaller ones.
8510     if (existing_after_state.num_chunks > existing_before_state.num_chunks) {
8511         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
8512                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
8513         if (!temp_chunks)
8514             return NV_ERR_NO_MEMORY;
8515         existing_gpu_state->chunks = temp_chunks;
8516     }
8517 
8518     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
8519 
8520     // If the chunk covering the split point is not populated, we're done. We've
8521     // already grown the array to cover any new chunks which may be populated
8522     // later.
8523     if (!original_chunk)
8524         return NV_OK;
8525 
8526     // Figure out the splits we need to perform. Remove all sizes >= the current
8527     // size, and all sizes < the target size. Note that the resulting mask will
8528     // be 0 if the sizes match (we're already splitting at a chunk boundary).
8529     UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size);
8530     UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size);
8531     split_sizes = gpu->parent->mmu_user_chunk_sizes;
8532     split_sizes &= existing_before_state.chunk_size - 1;
8533     split_sizes &= ~(new_state.chunk_size - 1);
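    // For example, if the supported sizes were 4K, 64K and 2M and we were
    // splitting a 2M chunk down to a 64K target, split_sizes would be left
    // with just 64K: the first mask drops 2M and the second drops 4K.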
8534 
8535     // Keep splitting the chunk covering the split point until we hit the target
8536     // size.
8537     curr_chunk = original_chunk;
8538     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8539         size_t last_index, num_subchunks;
8540 
8541         status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL);
8542         if (status != NV_OK)
8543             goto error;
8544 
8545         // Split physical GPU mappings for indirect peers
8546         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8547             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
8548 
8549             status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8550                                                                       peer_addr,
8551                                                                       subchunk_size);
8552             if (status != NV_OK)
8553                 goto error;
8554         }
8555 
8556         if (subchunk_size == new_state.chunk_size)
8557             break;
8558 
8559         // Compute the last subchunk index prior to the split point. Divide the
8560         // entire address space into units of subchunk_size, then mod by the
8561         // number of subchunks within the parent.
8562         last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size);
8563         num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size);
8564         UVM_ASSERT(num_subchunks > 1);
8565         last_index &= num_subchunks - 1;
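        // For example, if subchunk_size is 64K and new->start - 1 falls
        // 0x2ffff bytes past the start of curr_chunk (whose start is aligned
        // to its size), last_index works out to 2: the subchunk holding the
        // byte just before the split point.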
8566 
8567         uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk);
8568         UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size);
8569     }
8570 
8571     // Note that existing's chunks array still has a pointer to original_chunk,
8572     // not to any newly-split subchunks. If a subsequent split failure occurs on
8573     // a later GPU we'll have to merge it back. Once we're past the preallocate
8574     // stage we'll remove it from the chunks array and move the new split chunks
8575     // in.
8576 
8577     return NV_OK;
8578 
8579 error:
8580     // On error we need to leave the chunk in its initial state
8581     block_merge_chunk(existing, gpu, original_chunk);
8582 
8583     return status;
8584 }
8585 
8586 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block)
8587 {
8588     uvm_cpu_chunk_storage_mixed_t *mixed;
8589     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0);
8590     NV_STATUS status;
8591 
8592     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
8593     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK);
8594 
8595     mixed = uvm_kvmalloc_zero(sizeof(*mixed));
8596     if (!mixed)
8597         return NV_ERR_NO_MEMORY;
8598 
8599     status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots);
8600     if (status != NV_OK) {
8601         uvm_kvfree(mixed);
8602         return status;
8603     }
8604 
8605     bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
8606     block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8607     return status;
8608 }
8609 
8610 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index)
8611 {
8612     uvm_cpu_chunk_storage_mixed_t *mixed;
8613     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8614     uvm_cpu_chunk_t **small_chunks;
8615     size_t slot_index;
8616     NV_STATUS status;
8617 
8618     UVM_ASSERT(chunk);
8619     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8620     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8621 
8622     mixed = uvm_cpu_storage_get_ptr(block);
8623     slot_index = compute_slot_index(block, page_index);
8624     small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8625     if (!small_chunks)
8626         return NV_ERR_NO_MEMORY;
8627 
8628     status = uvm_cpu_chunk_split(chunk, small_chunks);
8629     if (status != NV_OK) {
8630         uvm_kvfree(small_chunks);
8631         return status;
8632     }
8633 
8634     mixed->slots[slot_index] = small_chunks;
8635     clear_bit(slot_index, mixed->big_chunks);
8636     return status;
8637 }
8638 
8639 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8640 {
8641     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8642     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
8643     uvm_chunk_size_t new_size;
8644     uvm_gpu_t *gpu;
8645     NvU64 gpu_mapping_addr;
8646     uvm_processor_mask_t gpu_split_mask;
8647     uvm_gpu_id_t id;
8648     NV_STATUS status;
8649 
8650     if (chunk_size == UVM_CHUNK_SIZE_2M)
8651         new_size = UVM_CHUNK_SIZE_64K;
8652     else
8653         new_size = UVM_CHUNK_SIZE_4K;
8654 
8655     UVM_ASSERT(IS_ALIGNED(chunk_size, new_size));
8656 
8657     uvm_processor_mask_zero(&gpu_split_mask);
8658     for_each_gpu_id(id) {
8659         if (!uvm_va_block_gpu_state_get(block, id))
8660             continue;
8661 
8662         gpu = block_get_gpu(block, id);
8663 
8664         // If the parent chunk has not been mapped, there is nothing to split.
8665         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8666         if (gpu_mapping_addr == 0)
8667             continue;
8668 
8669         status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8670                                                             gpu_mapping_addr,
8671                                                             new_size);
8672         if (status != NV_OK)
8673             goto merge;
8674 
8675         uvm_processor_mask_set(&gpu_split_mask, id);
8676     }
8677 
8678     if (new_size == UVM_CHUNK_SIZE_64K)
8679         status = block_split_cpu_chunk_to_64k(block);
8680     else
8681         status = block_split_cpu_chunk_to_4k(block, page_index);
8682 
8683     if (status != NV_OK) {
8684 merge:
8685         for_each_gpu_id_in_mask(id, &gpu_split_mask) {
8686             gpu = block_get_gpu(block, id);
8687             gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8688             uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8689                                                        gpu_mapping_addr,
8690                                                        chunk_size);
8691         }
8692     }
8693 
8694     return status;
8695 }
8696 
8697 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new)
8698 {
8699     uvm_cpu_chunk_storage_mixed_t *existing_mixed;
8700     uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
8701     size_t slot_offset;
8702     size_t existing_slot;
8703     NV_STATUS status = NV_OK;
8704 
8705     UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED);
8706     existing_mixed = uvm_cpu_storage_get_ptr(existing);
8707 
    // Pre-allocate chunk storage for the new block. By definition, the new
    // block will contain only 64K and/or 4K chunks.
8710     //
8711     // We do this here so there are no failures in block_split_cpu().
8712     new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed));
8713     if (!new_mixed)
8714         return NV_ERR_NO_MEMORY;
8715 
8716     slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start));
8717     existing_slot = slot_offset;
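    // Each non-big (4K-backed) slot at or after the split point in existing
    // maps to slot (existing_slot - slot_offset) in the new block, so
    // preallocate a small-chunk array for every such slot that's populated.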
8718     for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) {
8719         size_t new_slot = existing_slot - slot_offset;
8720 
8721         if (existing_mixed->slots[existing_slot]) {
8722             uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8723 
8724             if (!small_chunks) {
8725                 status = NV_ERR_NO_MEMORY;
8726                 goto done;
8727             }
8728 
8729             new_mixed->slots[new_slot] = small_chunks;
8730         }
8731     }
8732 
8733     new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8734     UVM_ASSERT(status == NV_OK);
8735 
8736 done:
8737     if (status != NV_OK) {
8738         for (; existing_slot > slot_offset; existing_slot--)
8739             uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]);
8740 
8741         uvm_kvfree(new_mixed);
8742     }
8743 
8744     return status;
8745 }
8746 
8747 static void block_free_cpu_chunk_storage(uvm_va_block_t *block)
8748 {
8749     if (block->cpu.chunks) {
8750         uvm_cpu_chunk_storage_mixed_t *mixed;
8751         size_t slot_index;
8752 
8753         UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8754         mixed = uvm_cpu_storage_get_ptr(block);
8755         for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
8756             uvm_kvfree(mixed->slots[slot_index]);
8757 
8758         uvm_kvfree(mixed);
8759         block->cpu.chunks = 0;
8760     }
8761 }
8762 
8763 // Perform any CPU chunk splitting that may be required for this block split.
8764 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block.
8765 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8766 {
8767     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8768     uvm_cpu_chunk_t *splitting_chunk;
8769     uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
8770     uvm_chunk_size_t subchunk_size;
8771     NV_STATUS status = NV_OK;
8772 
8773     UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
8774     splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8775 
8776     // If the page covering the split point has not been populated, there is no
8777     // need to split.
8778     if (!splitting_chunk)
8779         return NV_OK;
8780 
8781     // If the split point is aligned on the chunk size, there is no need to
8782     // split.
8783     if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
8784         return NV_OK;
8785 
8786     // Remove all sizes above the chunk's current size.
8787     split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
8788     // Remove all sizes below the alignment of the new block's start.
8789     split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
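    // For example, splitting a 2M chunk at a 64K-aligned split point requires
    // a single split into 64K chunks; a split point that isn't 64K-aligned
    // additionally requires the covering 64K chunk to be split into 4K chunks.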
8790 
8791     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8792         status = block_split_cpu_chunk_one(existing, page_index);
8793         if (status != NV_OK)
8794             return status;
8795     }
8796 
8797     return block_prealloc_cpu_chunk_storage(existing, new);
8798 }
8799 
8800 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index)
8801 {
8802     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8803     size_t slot_index = compute_slot_index(block, page_index);
8804     uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
8805     uvm_cpu_chunk_t *merged_chunk;
8806 
8807     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8808     UVM_ASSERT(small_chunks);
8809     UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
8810 
8811     merged_chunk = uvm_cpu_chunk_merge(small_chunks);
8812     mixed->slots[slot_index] = merged_chunk;
8813     set_bit(slot_index, mixed->big_chunks);
8814     uvm_kvfree(small_chunks);
8815 }
8816 
8817 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index)
8818 {
8819     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8820     uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
8821     uvm_cpu_chunk_t *merged_chunk;
8822 
8823     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8824     UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
8825 
8826     merged_chunk = uvm_cpu_chunk_merge(big_chunks);
8827     block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
8828     uvm_kvfree(mixed);
8829 }
8830 
8831 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8832 {
8833     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8834     uvm_gpu_id_t id;
8835 
8836     if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
8837         block_merge_cpu_chunks_to_64k(block, page_index);
8838     }
8839     else {
8840         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8841         block_merge_cpu_chunks_to_2m(block, page_index);
8842     }
8843 
8844     chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8845 
8846     for_each_gpu_id(id) {
8847         NvU64 gpu_mapping_addr;
8848         uvm_gpu_t *gpu;
8849 
8850         if (!uvm_va_block_gpu_state_get(block, id))
8851             continue;
8852 
8853         gpu = block_get_gpu(block, id);
8854         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8855         if (gpu_mapping_addr == 0)
8856             continue;
8857 
8858         uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8859                                                    gpu_mapping_addr,
8860                                                    uvm_cpu_chunk_get_size(chunk));
8861     }
8862 }
8863 
8864 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8865 {
8866     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8867     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8868     uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
8869     uvm_chunk_size_t largest_size;
8870     uvm_chunk_size_t chunk_size;
8871     uvm_chunk_size_t merge_size;
8872     size_t block_size = uvm_va_block_size(existing);
8873 
8874     if (!chunk || uvm_cpu_chunk_is_physical(chunk))
8875         return;
8876 
8877     chunk_size = uvm_cpu_chunk_get_size(chunk);
8878 
    // Remove all CPU chunk sizes above the size of the existing VA block.
    // Block sizes are not always powers of 2, so use the largest power of 2
    // less than or equal to the block size, since we can't merge to a size
    // larger than the block itself.
8883     largest_size = rounddown_pow_of_two(block_size);
8884     merge_sizes &= (largest_size | (largest_size - 1));
8885 
    // Remove the current chunk size and all CPU chunk sizes below it, since we
    // only merge upwards from the current size.
8887     merge_sizes &= ~(chunk_size | (chunk_size - 1));
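    // For example, with 4K, 64K and 2M allocation sizes, an existing block of
    // 1.5M after the split and a 4K chunk at the split point, largest_size is
    // 1M, so merge_sizes is reduced to just 64K and only a merge up to 64K is
    // attempted.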
8888 
8889     for_each_chunk_size(merge_size, merge_sizes) {
8890         uvm_va_block_region_t chunk_region;
8891 
8892         // The block has to fully contain the VA range after the merge.
8893         if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
8894             !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
8895             break;
8896 
8897         chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
8898 
8899         // If not all pages in the region covered by the chunk are allocated,
8900         // we can't merge.
8901         if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region))
8902             break;
8903 
8904         block_merge_cpu_chunks_one(existing, chunk_region.first);
8905         chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8906         if (uvm_cpu_chunk_is_physical(chunk))
8907             break;
8908     }
8909 
8910     block_free_cpu_chunk_storage(new);
8911 }
8912 
8913 // Pre-allocate everything which doesn't require retry on both existing and new
8914 // which will be needed to handle a split. If this fails, existing must remain
8915 // functionally unmodified.
8916 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new)
8917 {
8918     NV_STATUS status;
8919     uvm_gpu_t *gpu;
8920     uvm_gpu_id_t id;
8921     uvm_page_index_t split_page_index;
8922     uvm_va_block_test_t *block_test;
8923 
8924     status = block_presplit_cpu_chunks(existing, new);
8925     if (status != NV_OK)
8926         goto error;
8927 
8928     for_each_gpu_id(id) {
8929         if (!uvm_va_block_gpu_state_get(existing, id))
8930             continue;
8931 
8932         gpu = block_get_gpu(existing, id);
8933 
8934         status = block_presplit_gpu_chunks(existing, new, gpu);
8935         if (status != NV_OK)
8936             goto error;
8937 
8938         if (!block_gpu_state_get_alloc(new, gpu)) {
8939             status = NV_ERR_NO_MEMORY;
8940             goto error;
8941         }
8942     }
8943 
8944     block_test = uvm_va_block_get_test(existing);
8945     if (block_test && block_test->inject_split_error) {
8946         block_test->inject_split_error = false;
8947         if (!uvm_va_block_is_hmm(existing)) {
8948             UVM_ASSERT(existing->va_range->inject_split_error);
8949             existing->va_range->inject_split_error = false;
8950         }
8951         status = NV_ERR_NO_MEMORY;
8952         goto error;
8953     }
8954 
8955     if (uvm_va_block_is_hmm(existing)) {
8956         uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start);
8957 
8958         if (node && node->node.start != new->start) {
8959             status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL);
8960             if (status != NV_OK)
8961                 goto error;
8962         }
8963     }
8964 
8965     return NV_OK;
8966 
8967 error:
8968     // Merge back the chunks we split
8969     split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8970 
8971     for_each_gpu_id(id) {
8972         uvm_gpu_chunk_t *chunk;
8973         size_t chunk_index;
8974         uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id);
8975 
8976         if (!existing_gpu_state)
8977             continue;
8978 
8979         // If the chunk spanning the split point was split, merge it back
8980         gpu = block_get_gpu(existing, id);
8981         chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL);
8982         chunk = existing_gpu_state->chunks[chunk_index];
8983         if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
8984             continue;
8985 
8986         block_merge_chunk(existing, gpu, chunk);
8987 
8988         // We could attempt to shrink the chunks array back down, but it doesn't
8989         // hurt much to have it larger than necessary, and we'd have to handle
8990         // the shrink call failing anyway on this error path.
8991 
8992     }
8993 
8994     block_merge_cpu_chunks(existing, new);
8995 
8996     return status;
8997 }
8998 
// Re-calculate the block's top-level processor masks:
//   - block->mapped
//   - block->resident
//   - block->evicted_gpus
//
// This is called on block split.
9004 static void block_set_processor_masks(uvm_va_block_t *block)
9005 {
9006     size_t num_pages = uvm_va_block_num_cpu_pages(block);
9007     uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages);
9008     uvm_gpu_id_t id;
9009 
9010     if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) {
9011         UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region));
9012         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
9013     }
9014     else {
9015         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
9016     }
9017 
9018     if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) {
9019         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9020 
9021         if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
9022             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU));
9023 
9024         block_clear_resident_processor(block, UVM_ID_CPU);
9025     }
9026     else {
9027         block_set_resident_processor(block, UVM_ID_CPU);
9028     }
9029 
9030     for_each_gpu_id(id) {
9031         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
9032         if (!gpu_state)
9033             continue;
9034 
9035         if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) {
9036             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region));
9037             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region));
9038             uvm_processor_mask_clear(&block->mapped, id);
9039         }
9040         else {
9041             uvm_processor_mask_set(&block->mapped, id);
9042         }
9043 
9044         if (uvm_page_mask_region_empty(&gpu_state->resident, block_region))
9045             block_clear_resident_processor(block, id);
9046         else
9047             block_set_resident_processor(block, id);
9048 
9049         if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region))
9050             uvm_processor_mask_clear(&block->evicted_gpus, id);
9051         else
9052             uvm_processor_mask_set(&block->evicted_gpus, id);
9053     }
9054 }
9055 
9056 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts
9057 // corresponding to a block split.
9058 static void block_split_page_mask(uvm_page_mask_t *existing_mask,
9059                                   size_t existing_pages,
9060                                   uvm_page_mask_t *new_mask,
9061                                   size_t new_pages)
9062 {
9063     UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n",
9064                    existing_pages, new_pages);
9065 
9066     // The new block is always in the upper region of existing, so shift the bit
9067     // vectors down.
9068     //
9069     // Note that bitmap_shift_right requires both dst and src to be the same
9070     // size. That's ok since we don't scale them by block size.
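    // For example, with 4K pages a full block has 512 page bits; splitting at
    // page 384 moves the original bits 384..511 down to bits 0..127 of
    // new_mask and clears bits 384..511 of existing_mask.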
9071     uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages);
9072     uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages));
9073 }
9074 
9075 // Split the CPU state within the existing block. existing's start is correct
9076 // but its end has not yet been adjusted.
9077 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
9078 {
9079     size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new);
9080     uvm_pte_bits_cpu_t pte_bit;
9081     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing);
9082     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9083     uvm_page_index_t page_index;
9084     uvm_page_index_t next_page_index;
9085     uvm_cpu_chunk_t *chunk;
9086     uvm_va_range_t *existing_va_range = existing->va_range;
9087 
9088     if (existing_va_range) {
9089         UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9090         UVM_ASSERT(existing->va_range->type == new->va_range->type);
9091     }
9092 
9093     UVM_ASSERT(existing->start < new->start);
9094     UVM_ASSERT(existing->end == new->end);
9095 
9096     UVM_ASSERT(PAGE_ALIGNED(new->start));
9097     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9098 
9099     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9100 
9101     // We don't have to unmap the CPU since its virtual -> physical mappings
9102     // don't change.
9103 
9104     page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1);
9105 
9106     for_each_cpu_chunk_in_block_region_safe(chunk,
9107                                             page_index,
9108                                             next_page_index,
9109                                             existing,
9110                                             uvm_va_block_region(split_page_index, block_region.outer)) {
9111         uvm_page_index_t new_chunk_page_index;
9112         NV_STATUS status;
9113 
9114         uvm_cpu_chunk_remove_from_block(existing, page_index);
9115 
9116         // The chunk has to be adjusted for the new block before inserting it.
9117         new_chunk_page_index = page_index - split_page_index;
9118 
9119         // This should never fail because all necessary storage was allocated
9120         // in block_presplit_cpu_chunks().
9121         status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
9122         UVM_ASSERT(status == NV_OK);
9123     }
9124 
9125     new->cpu.ever_mapped = existing->cpu.ever_mapped;
9126 
9127     block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
9128 
9129     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
9130         block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
9131 }
9132 
9133 // Fill out the blocks' chunks arrays with the chunks split by
9134 // block_presplit_gpu_chunks.
9135 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9136 {
9137     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9138     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id);
9139     uvm_gpu_chunk_t **temp_chunks;
9140     uvm_gpu_chunk_t *original_chunk;
9141     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
9142     size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new;
9143     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9144     size_t i;
9145 
9146     block_gpu_chunk_get_split_state(existing,
9147                                     &existing_before_state,
9148                                     existing->start,
9149                                     existing->end,
9150                                     split_page_index,
9151                                     gpu);
9152     block_gpu_chunk_get_split_state(existing,
9153                                     &existing_after_state,
9154                                     existing->start,
9155                                     new->start - 1,
9156                                     split_page_index - 1,
9157                                     gpu);
9158     block_gpu_chunk_get_split_state(new,
9159                                     &new_state,
9160                                     new->start,
9161                                     new->end,
9162                                     0,
9163                                     gpu);
9164 
9165     // General case (B is original_chunk):
9166     //                                          split
9167     //                                            v
9168     //  existing (before) [------ A -----][------ B -----][------ C -----]
9169     //  existing (after)  [------ A -----][- B0 -]
9170     //  new                                       [- B1 -][------ C -----]
9171     //
9172     // Note that the logic below also handles the case of the split happening at
9173     // a chunk boundary. That case behaves as though there is no B0 chunk.
9174 
9175     // Number of chunks to the left and right of original_chunk (A and C above).
9176     // Either or both of these may be 0.
9177     num_pre_chunks  = existing_before_state.chunk_index;
9178     num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1;
9179 
9180     // Number of subchunks under existing's portion of original_chunk (B0 above)
9181     num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks;
9182 
9183     // Number of subchunks under new's portion of original_chunk (B1 above)
9184     num_split_chunks_new = new_state.num_chunks - num_post_chunks;
9185 
9186     UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0);
9187     UVM_ASSERT(num_split_chunks_new > 0);
9188 
9189     // Copy post chunks from the end of existing into new (C above)
9190     memcpy(&new_gpu_state->chunks[num_split_chunks_new],
9191            &existing_gpu_state->chunks[existing_before_state.chunk_index + 1],
9192            num_post_chunks * sizeof(new_gpu_state->chunks[0]));
9193 
9194     // Save off the original split chunk since we may overwrite the array
9195     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
9196 
9197     // Fill out the new pointers
9198     if (original_chunk) {
9199         // Note that if the split happened at a chunk boundary, original_chunk
9200         // will not be split. In that case, num_split_chunks_existing will be 0
9201         // and num_split_chunks_new will be 1, so the left copy will be skipped
9202         // and the right copy will pick up the chunk.
9203 
9204         // Copy left newly-split chunks into existing (B0 above). The array was
9205         // re-sized in block_presplit_gpu_chunks as necessary.
9206         size_t num_subchunks;
9207 
9208         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9209                                                   original_chunk,
9210                                                   0, // start_index
9211                                                   num_split_chunks_existing,
9212                                                   &existing_gpu_state->chunks[existing_before_state.chunk_index]);
9213         UVM_ASSERT(num_subchunks == num_split_chunks_existing);
9214 
9215         // Copy right newly-split chunks into new (B1 above), overwriting the
9216         // pointer to the original chunk.
9217         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9218                                                   original_chunk,
9219                                                   num_split_chunks_existing, // start_index
9220                                                   num_split_chunks_new,
9221                                                   &new_gpu_state->chunks[0]);
9222         UVM_ASSERT(num_subchunks == num_split_chunks_new);
9223     }
9224     else {
9225         // If the chunk wasn't already populated we don't need to copy pointers
9226         // anywhere, but we need to clear out stale pointers from existing's
9227         // array covering the new elements. new's chunks array was already zero-
9228         // initialized.
9229         memset(&existing_gpu_state->chunks[existing_before_state.chunk_index],
9230                0,
9231                num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0]));
9232     }
9233 
9234     // Since we update the reverse map information, protect it against a
9235     // concurrent lookup
9236     uvm_spin_lock(&gpu->pmm.list_lock);
9237 
9238     // Update the reverse map of all the chunks that are now under the new block
9239     for (i = 0; i < new_state.num_chunks; ++i) {
9240         if (new_gpu_state->chunks[i]) {
9241             UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing);
9242             new_gpu_state->chunks[i]->va_block = new;
9243 
9244             // Adjust the page_index within the VA block for the new subchunks in
9245             // the new VA block
9246             UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index);
9247             new_gpu_state->chunks[i]->va_block_page_index -= split_page_index;
9248         }
9249     }
9250 
9251     uvm_spin_unlock(&gpu->pmm.list_lock);
9252 
9253     // Attempt to shrink existing's chunk allocation. If the realloc fails, just
9254     // keep on using the old larger one.
9255     if (existing_after_state.num_chunks < existing_before_state.num_chunks) {
9256         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
9257                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
9258         if (temp_chunks)
9259             existing_gpu_state->chunks = temp_chunks;
9260     }
9261 }
9262 
9263 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id)
9264 {
9265     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id);
9266     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id);
9267     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9268     uvm_gpu_va_space_t *gpu_va_space;
9269     uvm_gpu_t *gpu;
9270     uvm_gpu_t *accessing_gpu;
9271     size_t new_pages = uvm_va_block_num_cpu_pages(new);
9272     size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
9273     uvm_pte_bits_gpu_t pte_bit;
9274     size_t num_chunks, i;
9275     uvm_cpu_chunk_t *cpu_chunk;
9276     uvm_page_index_t page_index;
9277 
9278     if (!existing_gpu_state)
9279         return;
9280 
9281     gpu = uvm_va_space_get_gpu(va_space, gpu_id);
9282     UVM_ASSERT(new_gpu_state);
9283 
9284     new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes;
9285 
9286     UVM_ASSERT(PAGE_ALIGNED(new->start));
9287     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9288     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9289 
9290     for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) {
9291         uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
9292                                                      uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent),
9293                                                      new);
9294     }
9295 
9296     block_copy_split_gpu_chunks(existing, new, gpu);
9297 
9298     num_chunks = block_num_gpu_chunks(new, gpu);
9299 
9300     // Reparent GPU mappings for indirect peers
9301     for (i = 0; i < num_chunks; ++i) {
9302         uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
9303         if (!chunk)
9304             continue;
9305 
9306         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9307             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
9308 
9309             uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
9310                                                                peer_addr,
9311                                                                new);
9312         }
9313     }
9314 
9315     block_split_page_mask(&existing_gpu_state->resident,
9316                           existing_pages,
9317                           &new_gpu_state->resident,
9318                           new_pages);
9319 
9320     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
9321         block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages,
9322                               &new_gpu_state->pte_bits[pte_bit], new_pages);
9323     }
9324 
9325     // Adjust page table ranges.
9326     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
9327     if (gpu_va_space) {
9328         if (existing_gpu_state->page_table_range_big.table) {
9329             NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
9330 
9331             // existing's end has not been adjusted yet
9332             existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
9333 
9334             // Take references on all big pages covered by new
9335             new_pages_big = uvm_va_block_num_big_pages(new, big_page_size);
9336             if (new_pages_big) {
9337                 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9338                                                &existing_gpu_state->page_table_range_big,
9339                                                &new_gpu_state->page_table_range_big,
9340                                                new_pages_big);
9341 
9342                 // If the split point is within a big page region, we might have
9343                 // a gap since neither existing nor new can use it anymore.
9344                 // Get the top N bits from existing's mask to handle that.
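                // For example, with 32 big-page slots and new covering the top
                // 8, bits 24..31 of existing's mask become bits 0..7 of new's.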
9345                 bitmap_shift_right(new_gpu_state->big_ptes,
9346                                    existing_gpu_state->big_ptes,
9347                                    uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big,
9348                                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9349 
9350                 new_gpu_state->initialized_big = existing_gpu_state->initialized_big;
9351             }
9352 
9353             // Drop existing's references on the big PTEs it no longer covers
9354             // now that new has references on them. Note that neither existing
9355             // nor new might have big PTEs after the split. In that case, this
9356             // shrink will free the entire old range.
9357             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9358                                         &existing_gpu_state->page_table_range_big,
9359                                         existing_pages_big);
9360 
9361             if (existing_pages_big == 0) {
9362                 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big));
9363                 existing_gpu_state->initialized_big = false;
9364             }
9365 
9366             bitmap_clear(existing_gpu_state->big_ptes,
9367                          existing_pages_big,
9368                          MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big);
9369         }
9370 
9371         if (existing_gpu_state->page_table_range_4k.table) {
9372             // Since existing and new share the same PDE we just need to bump
9373             // the ref-count on new's sub-range.
9374             uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9375                                            &existing_gpu_state->page_table_range_4k,
9376                                            &new_gpu_state->page_table_range_4k,
9377                                            uvm_va_block_size(new) / UVM_PAGE_SIZE_4K);
9378 
9379             // Drop existing's references on the PTEs it no longer covers now
9380             // that new has references on them.
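            //
            // existing_pages counts CPU (PAGE_SIZE) pages, so convert it to 4K
            // PTEs: with 64K kernel pages, for example, each CPU page spans 16
            // 4K PTEs, while with 4K kernel pages the two counts are equal.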
9381             existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K);
9382             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9383                                         &existing_gpu_state->page_table_range_4k,
9384                                         existing_pages_4k);
9385         }
9386 
9387         // We have to set this explicitly to handle the case of splitting an
9388         // invalid, active 2M PTE with no lower page tables allocated.
9389         if (existing_gpu_state->pte_is_2m) {
9390             UVM_ASSERT(!existing_gpu_state->page_table_range_big.table);
9391             UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table);
9392             existing_gpu_state->pte_is_2m = false;
9393         }
9394 
9395         // existing can't possibly cover 2MB after a split, so drop any 2M PTE
9396         // references it has. We've taken the necessary references on the lower
9397         // tables above.
9398         block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m);
9399         existing_gpu_state->activated_big = false;
9400         existing_gpu_state->activated_4k = false;
9401     }
9402 
9403     block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages);
9404 }
9405 
9406 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
9407                              NvU64 new_end,
9408                              uvm_va_block_t **new_va_block,
9409                              uvm_va_range_t *new_va_range)
9410 {
9411     uvm_va_space_t *va_space;
9412     uvm_va_block_t *new_block = NULL;
9413     NV_STATUS status;
9414 
9415     va_space = new_va_range->va_space;
9416     UVM_ASSERT(existing_va_block->va_range);
9417     UVM_ASSERT(existing_va_block->va_range->va_space == va_space);
9418     UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block));
9419 
9420     // External range types can't be split
9421     UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9422     UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9423     uvm_assert_rwsem_locked_write(&va_space->lock);
9424 
9425     UVM_ASSERT(new_end > existing_va_block->start);
9426     UVM_ASSERT(new_end < existing_va_block->end);
9427     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
9428 
9429     status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block);
9430     if (status != NV_OK)
9431         return status;
9432 
9433     // We're protected from other splits and faults by the va_space lock being
9434     // held in write mode, but that doesn't stop the reverse mapping (eviction
9435     // path) from inspecting the existing block. Stop those threads by taking
9436     // the block lock. When a reverse mapping thread takes this lock after the
9437     // split has been performed, it will have to re-inspect state and may see
9438     // that it should use the newly-split block instead.
9439     uvm_mutex_lock(&existing_va_block->lock);
9440 
9441     status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range);
9442 
9443     uvm_mutex_unlock(&existing_va_block->lock);
9444 
9445     if (status != NV_OK)
9446         uvm_va_block_release(new_block);
9447     else if (new_va_block)
9448         *new_va_block = new_block;
9449 
9450     return status;
9451 }
9452 
9453 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
9454                                     NvU64 new_end,
9455                                     uvm_va_block_t *new_block,
9456                                     uvm_va_range_t *new_va_range)
9457 {
9458     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block);
9459     uvm_gpu_id_t id;
9460     NV_STATUS status;
9461     uvm_perf_event_data_t event_data;
9462 
9463     UVM_ASSERT(block_check_chunks(existing_va_block));
9464 
9465     // As soon as we update existing's reverse mappings to point to the newly-
9466     // split block, the eviction path could try to operate on the new block.
9467     // Lock that out too until new is ready.
9468     //
9469     // Note that we usually shouldn't nest block locks, but it's ok here because
9470     // we just created new_block so no other thread could possibly take it out
9471     // of order with existing's lock.
9472     uvm_mutex_lock_no_tracking(&new_block->lock);
9473 
9474     // The split has to be transactional, meaning that if we fail, the existing
9475     // block must not be modified. Handle that by pre-allocating everything we
9476     // might need under both existing and new at the start so we only have a
9477     // single point of failure.
9478 
    // Since pre-allocation might require allocating new PTEs, we have to handle
    // allocation retry, which might drop existing's block lock. The
    // preallocation is therefore split into two steps: the first step, which
    // allocates and splits PTEs, can handle having the block lock dropped and
    // re-taken. It won't modify existing_va_block other than adding new PTE
    // allocations and splitting existing PTEs, which is always safe.
9485     status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block,
9486                                        NULL,
9487                                        block_split_presplit_ptes(existing_va_block, new_block));
9488     if (status != NV_OK)
9489         goto out;
9490 
9491     // Pre-allocate, stage two. This modifies existing_va_block in ways which
9492     // violate many assumptions (such as changing chunk size), but it will put
9493     // things back into place on a failure without dropping the block lock.
9494     status = block_split_preallocate_no_retry(existing_va_block, new_block);
9495     if (status != NV_OK)
9496         goto out;
9497 
9498     // We'll potentially be freeing page tables, so we need to wait for any
9499     // outstanding work before we start
9500     status = uvm_tracker_wait(&existing_va_block->tracker);
9501     if (status != NV_OK)
9502         goto out;
9503 
9504     // Update existing's state only once we're past all failure points
9505 
9506     event_data.block_shrink.block = existing_va_block;
9507     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data);
9508 
9509     block_split_cpu(existing_va_block, new_block);
9510 
9511     for_each_gpu_id(id)
9512         block_split_gpu(existing_va_block, new_block, id);
9513 
    // Update the size of the existing block first so that
    // block_set_processor_masks can use block_{set,clear}_resident_processor,
    // which rely on the size being correct.
9517     existing_va_block->end = new_end;
9518 
9519     block_split_page_mask(&existing_va_block->read_duplicated_pages,
9520                           uvm_va_block_num_cpu_pages(existing_va_block),
9521                           &new_block->read_duplicated_pages,
9522                           uvm_va_block_num_cpu_pages(new_block));
9523 
9524     block_split_page_mask(&existing_va_block->maybe_mapped_pages,
9525                           uvm_va_block_num_cpu_pages(existing_va_block),
9526                           &new_block->maybe_mapped_pages,
9527                           uvm_va_block_num_cpu_pages(new_block));
9528 
9529     block_set_processor_masks(existing_va_block);
9530     block_set_processor_masks(new_block);
9531 
9532     if (uvm_va_block_is_hmm(existing_va_block)) {
9533         uvm_hmm_va_block_split_tree(existing_va_block, new_block);
9534         uvm_va_policy_node_split_move(existing_va_block, new_block);
9535     }
9536 
9537 out:
9538     // Run checks on existing_va_block even on failure, since an error must
9539     // leave the block in a consistent state.
9540     UVM_ASSERT(block_check_chunks(existing_va_block));
9541     UVM_ASSERT(block_check_mappings(existing_va_block));
9542     if (status == NV_OK) {
9543         UVM_ASSERT(block_check_chunks(new_block));
9544         UVM_ASSERT(block_check_mappings(new_block));
9545     }
9546     else {
9547         block_free_cpu_chunk_storage(new_block);
9548     }
9549 
9550     uvm_mutex_unlock_no_tracking(&new_block->lock);
9551 
9552     return status;
9553 }
9554 
9555 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block,
9556                                               uvm_va_block_region_t region)
9557 {
9558     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9559     uvm_va_range_t *va_range = va_block->va_range;
9560 
9561     if (!uvm_va_space_can_read_duplicate(va_space, NULL))
9562         return false;
9563 
9564     // TODO: Bug 3660922: need to implement HMM read duplication support.
9565     if (uvm_va_block_is_hmm(va_block) ||
9566         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
9567         return false;
9568 
    if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET &&
        uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0)
9571         return false;
9572 
9573     return true;
9574 }
9575 
9576 // Returns the new access permission for the processor that faulted or
9577 // triggered access counter notifications on the given page
9578 //
9579 // TODO: Bug 1766424: this function works on a single page at a time. This
9580 //       could be changed in the future to optimize multiple faults/counters on
9581 //       contiguous pages.
9582 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block,
9583                                          uvm_va_block_context_t *va_block_context,
9584                                          uvm_page_index_t page_index,
9585                                          uvm_processor_id_t fault_processor_id,
9586                                          uvm_processor_id_t new_residency,
9587                                          uvm_fault_access_type_t access_type)
9588 {
9589     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9590     uvm_prot_t logical_prot, new_prot;
9591 
9592     // TODO: Bug 1766432: Refactor into policies. Current policy is
9593     //       query_promote: upgrade access privileges to avoid future faults IF
9594     //       they don't trigger further revocations.
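    //
    // For example, a read fault (new_prot == READ_ONLY) on a page whose
    // logical protection allows writes may be upgraded to READ_WRITE below,
    // provided the page is not a read-duplication candidate and no faultable
    // processor holding an atomic mapping would need that mapping revoked.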
9595     new_prot = uvm_fault_access_type_to_prot(access_type);
9596     logical_prot = compute_logical_prot(va_block, va_block_context, page_index);
9597 
9598     UVM_ASSERT(logical_prot >= new_prot);
9599 
9600     if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY &&
9601         !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) {
9602         uvm_processor_mask_t processors_with_atomic_mapping;
9603         uvm_processor_mask_t revoke_processors;
9604 
9605         block_page_authorized_processors(va_block,
9606                                          page_index,
9607                                          UVM_PROT_READ_WRITE_ATOMIC,
9608                                          &processors_with_atomic_mapping);
9609 
9610         uvm_processor_mask_andnot(&revoke_processors,
9611                                   &processors_with_atomic_mapping,
9612                                   &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9613 
        // Restrict the check to faultable processors: only upgrade to
        // read/write if none of them would require a revocation.
9616         uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors);
9617 
9618         if (uvm_processor_mask_empty(&revoke_processors))
9619             new_prot = UVM_PROT_READ_WRITE;
9620     }
9621     if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC) {
9622         // HMM allocations with logical read/write/atomic permission can be
9623         // upgraded without notifying the driver so assume read/write/atomic
9624         // even if the fault is only for reading.
9625         if (new_prot == UVM_PROT_READ_WRITE ||
9626             (UVM_ID_IS_CPU(fault_processor_id) && uvm_va_block_is_hmm(va_block))) {
9627             if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id))
9628                 new_prot = UVM_PROT_READ_WRITE_ATOMIC;
9629         }
9630     }
9631 
9632     return new_prot;
9633 }
9634 
9635 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9636                                                        uvm_va_block_context_t *va_block_context,
9637                                                        uvm_processor_id_t new_residency,
9638                                                        uvm_processor_id_t processor_id,
9639                                                        const uvm_processor_mask_t *map_processors,
9640                                                        uvm_va_block_region_t region,
9641                                                        const uvm_page_mask_t *map_page_mask,
9642                                                        uvm_prot_t max_prot,
9643                                                        const uvm_processor_mask_t *thrashing_processors,
9644                                                        uvm_tracker_t *tracker)
9645 {
9646     NV_STATUS status;
9647     uvm_processor_id_t map_processor_id;
9648     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9649     uvm_prot_t new_map_prot = max_prot;
9650     uvm_processor_mask_t map_processors_local;
9651 
9652     uvm_processor_mask_copy(&map_processors_local, map_processors);
9653 
9654     // Handle atomic mappings separately
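    //
    // When max_prot is RWA: if the requesting processor has native atomics to
    // the new residency, processors that also have native atomics are mapped
    // with RWA here and the rest fall through to the loop below with RW.
    // Otherwise the loop below uses RW when the requester is the CPU and RO
    // when it is a GPU.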
9655     if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) {
9656         bool this_processor_has_native_atomics;
9657 
9658         this_processor_has_native_atomics =
9659             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id);
9660 
9661         if (this_processor_has_native_atomics) {
9662             uvm_processor_mask_t map_atomic_processors;
9663 
9664             // Compute processors with native atomics to the residency
9665             uvm_processor_mask_and(&map_atomic_processors,
9666                                    &map_processors_local,
9667                                    &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9668 
9669             // Filter out these mapped processors for the next steps
9670             uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors);
9671 
9672             for_each_id_in_mask(map_processor_id, &map_atomic_processors) {
9673                 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9674                 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9675                     cause = UvmEventMapRemoteCauseThrashing;
9676 
9677                 status = uvm_va_block_map(va_block,
9678                                           va_block_context,
9679                                           map_processor_id,
9680                                           region,
9681                                           map_page_mask,
9682                                           UVM_PROT_READ_WRITE_ATOMIC,
9683                                           cause,
9684                                           tracker);
9685                 if (status != NV_OK)
9686                     return status;
9687             }
9688 
9689             new_map_prot = UVM_PROT_READ_WRITE;
9690         }
9691         else {
9692             if (UVM_ID_IS_CPU(processor_id))
9693                 new_map_prot = UVM_PROT_READ_WRITE;
9694             else
9695                 new_map_prot = UVM_PROT_READ_ONLY;
9696         }
9697     }
9698 
    // Map the rest of the processors
9700     for_each_id_in_mask(map_processor_id, &map_processors_local) {
9701         UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9702         uvm_prot_t final_map_prot;
9703         bool map_processor_has_enabled_system_wide_atomics =
9704             uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id);
9705 
9706         // Write mappings from processors with disabled system-wide atomics are treated like atomics
9707         if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics)
9708             final_map_prot = UVM_PROT_READ_WRITE_ATOMIC;
9709         else
9710             final_map_prot = new_map_prot;
9711 
9712         if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9713             cause = UvmEventMapRemoteCauseThrashing;
9714 
9715         status = uvm_va_block_map(va_block,
9716                                   va_block_context,
9717                                   map_processor_id,
9718                                   region,
9719                                   map_page_mask,
9720                                   final_map_prot,
9721                                   cause,
9722                                   tracker);
9723         if (status != NV_OK)
9724             return status;
9725     }
9726 
9727     return NV_OK;
9728 }
9729 
9730 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9731                                                     uvm_va_block_context_t *va_block_context,
9732                                                     uvm_processor_id_t new_residency,
9733                                                     uvm_processor_id_t processor_id,
9734                                                     uvm_va_block_region_t region,
9735                                                     const uvm_page_mask_t *map_page_mask,
9736                                                     uvm_prot_t max_prot,
9737                                                     const uvm_processor_mask_t *thrashing_processors)
9738 {
9739     NV_STATUS tracker_status, status = NV_OK;
9740     uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus;
9741     uvm_processor_id_t map_processor_id;
9742     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9743     const uvm_page_mask_t *final_page_mask = map_page_mask;
9744     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9745     const uvm_va_policy_t *policy = va_block_context->policy;
9746     uvm_processor_id_t preferred_location;
9747 
9748     uvm_assert_mutex_locked(&va_block->lock);
9749     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region));
9750 
9751     // Read duplication takes precedence over SetAccessedBy.
9752     //
9753     // Exclude ranges with read duplication set...
9754     if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
9755         status = NV_OK;
9756         goto out;
9757     }
9758 
9759     // ... and pages read-duplicated by performance heuristics
9760     if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) {
9761         if (map_page_mask) {
9762             uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask,
9763                                  map_page_mask,
9764                                  &va_block->read_duplicated_pages);
9765         }
9766         else {
9767             uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages);
9768         }
9769         final_page_mask = &va_block_context->mapping.filtered_page_mask;
9770     }
9771 
9772     // Add mappings for accessed_by processors and the given processor mask
9773     if (thrashing_processors)
9774         uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors);
9775     else
9776         uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by);
9777 
9778     // Only processors that can access the new location must be considered
9779     uvm_processor_mask_and(&map_other_processors,
9780                            &map_other_processors,
9781                            &va_space->accessible_from[uvm_id_value(new_residency)]);
9782 
9783     // Exclude caller processor as it must have already been mapped
9784     uvm_processor_mask_clear(&map_other_processors, processor_id);
9785 
9786     // Exclude preferred location so it won't get remote mappings
9787     preferred_location = policy->preferred_location;
9788     if (UVM_ID_IS_VALID(preferred_location) &&
9789         !uvm_id_equal(new_residency, preferred_location) &&
9790         uvm_va_space_processor_has_memory(va_space, preferred_location)) {
9791         uvm_processor_mask_clear(&map_other_processors, preferred_location);
9792     }
9793 
9794     // Map the UVM-Lite GPUs if the new location is the preferred location. This
9795     // will only create mappings on first touch. After that they're persistent
9796     // so uvm_va_block_map will be a no-op.
9797     uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9798     if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) &&
9799         uvm_id_equal(new_residency, preferred_location)) {
9800         for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) {
9801             status = uvm_va_block_map(va_block,
9802                                       va_block_context,
9803                                       map_processor_id,
9804                                       region,
9805                                       final_page_mask,
9806                                       UVM_PROT_READ_WRITE_ATOMIC,
9807                                       UvmEventMapRemoteCauseCoherence,
9808                                       &local_tracker);
9809             if (status != NV_OK)
9810                 goto out;
9811         }
9812     }
9813 
9814     uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9815 
9816     // We can't map non-migratable pages to the CPU. If we have any, build a
9817     // new mask of migratable pages and map the CPU separately.
9818     if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) &&
9819         !uvm_range_group_all_migratable(va_space,
9820                                         uvm_va_block_region_start(va_block, region),
9821                                         uvm_va_block_region_end(va_block, region))) {
9822         uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask;
9823 
9824         uvm_range_group_migratable_page_mask(va_block, region, migratable_mask);
9825         if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) {
9826             uvm_processor_mask_t cpu_mask;
9827             uvm_processor_mask_zero(&cpu_mask);
9828             uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU);
9829 
9830             status = do_block_add_mappings_after_migration(va_block,
9831                                                            va_block_context,
9832                                                            new_residency,
9833                                                            processor_id,
9834                                                            &cpu_mask,
9835                                                            region,
9836                                                            migratable_mask,
9837                                                            max_prot,
9838                                                            thrashing_processors,
9839                                                            &local_tracker);
9840             if (status != NV_OK)
9841                 goto out;
9842         }
9843 
9844         uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU);
9845     }
9846 
9847     status = do_block_add_mappings_after_migration(va_block,
9848                                                    va_block_context,
9849                                                    new_residency,
9850                                                    processor_id,
9851                                                    &map_other_processors,
9852                                                    region,
9853                                                    final_page_mask,
9854                                                    max_prot,
9855                                                    thrashing_processors,
9856                                                    &local_tracker);
9859 
9860 out:
9861     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9862     uvm_tracker_deinit(&local_tracker);
9863     return status == NV_OK ? tracker_status : status;
9864 }
9865 
9866 // TODO: Bug 1750144: check logical permissions from HMM to know what's the
9867 //       maximum allowed.
9868 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
9869                                                         uvm_processor_id_t processor_id,
9870                                                         uvm_page_index_t page_index)
9871 {
9872     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9873     uvm_processor_mask_t resident_processors;
9874     NvU32 resident_processors_count;
9875 
9876     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id))
9877         return UVM_PROT_READ_WRITE_ATOMIC;
9878 
9879     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
9880     resident_processors_count = uvm_processor_mask_get_count(&resident_processors);
9881 
9882     if (resident_processors_count == 0) {
9883         return UVM_PROT_NONE;
9884     }
9885     else if (resident_processors_count > 1) {
        // If there are multiple resident copies, we can only map READ ONLY
9887         //
9888         // The block state doesn't track the mapping target (aperture) of each
9889         // individual PTE, just the permissions and where the data is resident.
9890         // If the data is resident in multiple places, then we have a problem
9891         // since we can't know where the PTE points. This means we won't know
9892         // what needs to be unmapped for cases like UvmUnregisterGpu and
9893         // UvmDisablePeerAccess.
9894         //
9895         // The simple way to solve this is to enforce that a read-duplication
9896         // mapping always points to local memory.
9897         if (uvm_processor_mask_test(&resident_processors, processor_id))
9898             return UVM_PROT_READ_ONLY;
9899 
9900         return UVM_PROT_NONE;
9901     }
9902     else {
9903         uvm_processor_id_t atomic_id;
9904         uvm_processor_id_t residency;
9905         uvm_processor_mask_t atomic_mappings;
9906         uvm_processor_mask_t write_mappings;
9907 
        // Find the id of the processor with the only resident copy
9909         residency = uvm_processor_mask_find_first_id(&resident_processors);
9910         UVM_ASSERT(UVM_ID_IS_VALID(residency));
9911 
9912         // If we cannot map the processor with the resident copy, exit
9913         if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id))
9914             return UVM_PROT_NONE;
9915 
9916         // Fast path: if the page is not mapped anywhere else, it can be safely
9917         // mapped with RWA permission
9918         if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index))
9919             return UVM_PROT_READ_WRITE_ATOMIC;
9920 
9921         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
9922 
9923         // Exclude processors with system-wide atomics disabled from atomic_mappings
9924         uvm_processor_mask_and(&atomic_mappings,
9925                                &atomic_mappings,
9926                                &va_space->system_wide_atomics_enabled_processors);
9927 
9928         // Exclude the processor for which the mapping protections are being computed
9929         uvm_processor_mask_clear(&atomic_mappings, processor_id);
9930 
        // If there is any processor with an atomic mapping, check whether it
        // has native atomics to the processor with the resident copy. If it
        // does not, we can only map READ ONLY.
9933         atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings);
9934         if (UVM_ID_IS_VALID(atomic_id) &&
9935             !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) {
9936             return UVM_PROT_READ_ONLY;
9937         }
9938 
9939         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
9940 
9941         // Exclude the processor for which the mapping protections are being computed
9942         uvm_processor_mask_clear(&write_mappings, processor_id);
9943 
        // At this point, any processor with atomic mappings either has native
        // atomics support to the processor with the resident copy or has
        // disabled system-wide atomics. If the requesting processor has
        // disabled system-wide atomics or has native atomics to that
        // processor, we can map with ATOMIC privileges. Likewise, if there are
        // no other processors with WRITE or ATOMIC mappings, we can map with
        // ATOMIC privileges.
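        //
        // For example, a GPU with native atomics to the resident processor is
        // granted RWA, while a GPU without them is granted RWA only when no
        // other processor holds WRITE or ATOMIC mappings, and RW otherwise.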
9949         if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) ||
9950             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) ||
9951             uvm_processor_mask_empty(&write_mappings)) {
9952             return UVM_PROT_READ_WRITE_ATOMIC;
9953         }
9954 
9955         return UVM_PROT_READ_WRITE;
9956     }
9957 }
9958 
9959 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
9960                                     uvm_va_block_context_t *va_block_context,
9961                                     uvm_processor_id_t processor_id,
9962                                     uvm_va_block_region_t region,
9963                                     const uvm_page_mask_t *page_mask,
9964                                     UvmEventMapRemoteCause cause)
9965 {
9966     uvm_va_range_t *va_range = va_block->va_range;
9967     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9968     NV_STATUS status = NV_OK;
9969     uvm_page_index_t page_index;
9970     uvm_range_group_range_iter_t iter;
9971     uvm_prot_t prot_to_map;
9972 
9973     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
9974 
9975     if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9976         if (!uvm_va_range_vma_check(va_range, va_block_context->mm))
9977             return NV_OK;
9978 
9979         uvm_range_group_range_migratability_iter_first(va_space,
9980                                                        uvm_va_block_region_start(va_block, region),
9981                                                        uvm_va_block_region_end(va_block, region),
9982                                                        &iter);
9983     }
9984 
9985     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map)
9986         va_block_context->mask_by_prot[prot_to_map - 1].count = 0;
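
    // mask_by_prot[] buckets pages by the highest protection they can be
    // mapped with; one uvm_va_block_map() call is then issued per non-empty
    // bucket.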
9987 
9988     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
9989         // Read duplication takes precedence over SetAccessedBy. Exclude pages
9990         // read-duplicated by performance heuristics
9991         if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))
9992             continue;
9993 
9994         prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index);
9995         if (prot_to_map == UVM_PROT_NONE)
9996             continue;
9997 
9998         if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9999             while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) {
10000                 uvm_range_group_range_migratability_iter_next(va_space,
10001                                                               &iter,
10002                                                               uvm_va_block_region_end(va_block, region));
10003             }
10004 
10005             if (!iter.migratable)
10006                 continue;
10007         }
10008 
10009         if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0)
10010             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask);
10011 
10012         uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index);
10013     }
10014 
10015     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) {
10016         if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0)
10017             continue;
10018 
10019         status = uvm_va_block_map(va_block,
10020                                   va_block_context,
10021                                   processor_id,
10022                                   region,
10023                                   &va_block_context->mask_by_prot[prot_to_map - 1].page_mask,
10024                                   prot_to_map,
10025                                   cause,
10026                                   &va_block->tracker);
10027         if (status != NV_OK)
10028             break;
10029     }
10030 
10031     return status;
10032 }
10033 
10034 static bool can_read_duplicate(uvm_va_block_t *va_block,
10035                                uvm_page_index_t page_index,
10036                                const uvm_va_policy_t *policy,
10037                                const uvm_perf_thrashing_hint_t *thrashing_hint)
10038 {
10039     if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block)))
10040         return true;
10041 
10042     if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
10043         uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) &&
10044         thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN)
10045         return true;
10046 
10047     return false;
10048 }
10049 
10050 // TODO: Bug 1827400: If the faulting processor has support for native
10051 //       atomics to the current location and the faults on the page were
10052 //       triggered by atomic accesses only, we keep the current residency.
10053 //       This is a short-term solution to exercise remote atomics over
10054 //       NVLINK when possible (not only when preferred location is set to
10055 //       the remote GPU) as they are much faster than relying on page
10056 //       faults and permission downgrades, which cause thrashing. In the
10057 //       future, the thrashing detection/prevention heuristics should
10058 //       detect and handle this case.
10059 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space,
10060                                        NvU32 access_type_mask,
10061                                        uvm_processor_id_t processor_id,
10062                                        uvm_processor_id_t residency)
10063 {
10064     // This policy can be enabled/disabled using a module parameter
10065     if (!uvm_perf_map_remote_on_native_atomics_fault)
10066         return false;
10067 
10068     // Only consider atomics faults
10069     if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)
10070         return false;
10071 
    // We cannot differentiate CPU writes from atomics, so we exclude CPU
    // faults from the logic above in order to avoid mapping the CPU to vidmem
    // due to a plain write.
10075     if (UVM_ID_IS_CPU(processor_id))
10076         return false;
10077 
10078     // On P9 systems (which have native HW support for system-wide atomics), we
10079     // have determined experimentally that placing memory on a GPU yields the
    // best performance in most cases (since the CPU can cache vidmem but not
    // vice versa). Therefore, don't map remotely if the current residency is
10082     // sysmem.
10083     if (UVM_ID_IS_CPU(residency))
10084         return false;
10085 
10086     return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id);
10087 }
10088 
10089 // TODO: Bug 1766424: this function works on a single page at a time. This
10090 //       could be changed in the future to optimize multiple faults or access
10091 //       counter notifications on contiguous pages.
10092 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
10093                                                  uvm_va_block_context_t *va_block_context,
10094                                                  uvm_page_index_t page_index,
10095                                                  uvm_processor_id_t processor_id,
10096                                                  NvU32 access_type_mask,
10097                                                  const uvm_va_policy_t *policy,
10098                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10099                                                  uvm_service_operation_t operation,
10100                                                  bool *read_duplicate)
10101 {
10102     uvm_processor_id_t closest_resident_processor;
10103     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10104     bool may_read_duplicate;
10105     uvm_processor_id_t preferred_location;
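
    // Residency is selected by walking a priority list: forced sysmem, read
    // duplication, preferred-location and HMM special cases, thrashing pin
    // hints, keeping the closest resident copy (accessed-by or remote-atomic
    // cases), then the preferred location, and finally the faulting processor
    // itself.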
10106 
    // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as
    // HMM migration is implemented for VMAs other than anonymous memory.
10109     if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) {
10110         *read_duplicate = false;
10111         return UVM_ID_CPU;
10112     }
10113 
10114     may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
10115 
10116     // Read/prefetch faults on a VA range with read duplication enabled
10117     // always create a copy of the page on the faulting processor's memory.
10118     // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH,
10119     // which will lead to read duplication if it is enabled.
10120     *read_duplicate = may_read_duplicate &&
10121                       (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ);
10122 
10123     if (*read_duplicate)
10124         return processor_id;
10125 
10126     *read_duplicate = false;
10127 
    // If read duplication is active on the page but we are not read
    // duplicating because the access type is not a read or a prefetch, the
    // faulting processor should get a local copy.
10131     if (may_read_duplicate)
10132         return processor_id;
10133 
10134     // If the faulting processor is the preferred location always migrate
10135     preferred_location = policy->preferred_location;
10136     if (uvm_id_equal(processor_id, preferred_location)) {
10137         if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) {
10138             UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN);
10139             if (uvm_va_space_processor_has_memory(va_space, processor_id))
10140                 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id));
10141         }
10142 
10143         return processor_id;
10144     }
10145 
10146     // If the faulting processor is the CPU, HMM has to migrate the block to
10147     // system memory.
10148     // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements.
10149     if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block))
10150         return processor_id;
10151 
10152     if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10153         UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)],
10154                                            processor_id));
10155         return thrashing_hint->pin.residency;
10156     }
10157 
10158     closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id);
10159 
10160     // If the page is not resident anywhere, select the preferred location as
10161     // long as the preferred location is accessible from the faulting processor.
10162     // Otherwise select the faulting processor.
10163     if (UVM_ID_IS_INVALID(closest_resident_processor)) {
10164         if (UVM_ID_IS_VALID(preferred_location) &&
10165             uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)],
10166                                     processor_id)) {
10167             return preferred_location;
10168         }
10169 
10170         return processor_id;
10171     }
10172 
    // AccessedBy mappings might not have been created for the CPU if the
    // thread which made the memory resident did not have the proper references
    // on the mm_struct (for example, the GPU fault handling path when
    // uvm_va_space_mm_enabled() is false).
10177     //
10178     // Also, in uvm_migrate_*, we implement a two-pass scheme in which
10179     // AccessedBy mappings may be delayed to the second pass. This can produce
10180     // faults even if the faulting processor is in the accessed_by mask.
10181     //
    // Here, we keep the page at its current residency and just add the missing
    // mapping.
10184     if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
10185         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10186         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10187         return closest_resident_processor;
10188     }
10189 
10190     // Check if we should map the closest resident processor remotely on atomic
10191     // fault
10192     if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
10193         return closest_resident_processor;
10194 
10195     // If the processor has access to the preferred location, and the page is
10196     // not resident on the accessing processor, move it to the preferred
10197     // location.
10198     if (!uvm_id_equal(closest_resident_processor, processor_id) &&
10199         UVM_ID_IS_VALID(preferred_location) &&
10200         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
10201         return preferred_location;
10202 
10203     // If the page is resident on a processor other than the preferred location,
10204     // or the faulting processor can't access the preferred location, we select
10205     // the faulting processor as the new residency.
10206     return processor_id;
10207 }
10208 
10209 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
10210                                                  uvm_va_block_context_t *va_block_context,
10211                                                  uvm_page_index_t page_index,
10212                                                  uvm_processor_id_t processor_id,
10213                                                  NvU32 access_type_mask,
10214                                                  const uvm_va_policy_t *policy,
10215                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10216                                                  uvm_service_operation_t operation,
10217                                                  bool *read_duplicate)
10218 {
10219     uvm_processor_id_t id;
10220 
10221     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10222                                                   va_block_context->policy,
10223                                                   uvm_va_block_region_for_page(page_index)));
10224     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10225                                                   va_block_context,
10226                                                   uvm_va_block_region_for_page(page_index)));
10227 
10228     id = block_select_residency(va_block,
10229                                 va_block_context,
10230                                 page_index,
10231                                 processor_id,
10232                                 access_type_mask,
10233                                 policy,
10234                                 thrashing_hint,
10235                                 operation,
10236                                 read_duplicate);
10237 
10238     // If the intended residency doesn't have memory, fall back to the CPU.
10239     if (!block_processor_has_memory(va_block, id)) {
10240         *read_duplicate = false;
10241         return UVM_ID_CPU;
10242     }
10243 
10244     return id;
10245 }
10246 
10247 static bool check_access_counters_dont_revoke(uvm_va_block_t *block,
10248                                               uvm_va_block_context_t *block_context,
10249                                               uvm_va_block_region_t region,
10250                                               const uvm_processor_mask_t *revoke_processors,
10251                                               const uvm_page_mask_t *revoke_page_mask,
10252                                               uvm_prot_t revoke_prot)
10253 {
10254     uvm_processor_id_t id;
10255     for_each_id_in_mask(id, revoke_processors) {
10256         const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot);
10257 
10258         uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot);
10259 
10260         UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0);
10261     }
10262 
10263     return true;
10264 }
10265 
10266 // Update service_context->prefetch_hint, service_context->per_processor_masks,
10267 // and service_context->region.
10268 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
10269                                            uvm_service_block_context_t *service_context)
10270 {
10271     uvm_processor_id_t new_residency;
10272     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10273 
    // Performance heuristics policy: we only consider prefetching when all
    // migrations target a single processor.
10276     if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) {
10277         uvm_page_index_t page_index;
10278         uvm_page_mask_t *new_residency_mask;
10279         const uvm_va_policy_t *policy = service_context->block_context.policy;
10280 
10281         new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors);
10282         new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10283 
10284         // Update prefetch tracking structure with the pages that will migrate
10285         // due to faults
10286         uvm_perf_prefetch_get_hint(va_block,
10287                                    &service_context->block_context,
10288                                    new_residency,
10289                                    new_residency_mask,
10290                                    service_context->region,
10291                                    &service_context->prefetch_bitmap_tree,
10292                                    &service_context->prefetch_hint);
10293 
        // Apply the prefetch hint by giving a fake fault access type to the
        // prefetched pages
10296         if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) {
10297             const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask;
10298 
10299             for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) {
10300                 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index));
10301 
10302                 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH;
10303 
10304                 if (uvm_va_policy_is_read_duplicate(policy, va_space) ||
10305                     (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
10306                      uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) {
10307                     if (service_context->read_duplicate_count++ == 0)
10308                         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10309 
10310                     uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10311                 }
10312             }
10313 
10314             uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask);
10315             service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask);
10316         }
10317     }
10318     else {
10319         service_context->prefetch_hint.residency = UVM_ID_INVALID;
10320     }
10321 }
10322 
10323 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
10324                                     uvm_processor_id_t new_residency,
10325                                     uvm_va_block_t *va_block,
10326                                     uvm_va_block_retry_t *block_retry,
10327                                     uvm_service_block_context_t *service_context)
10328 {
10329     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10330     uvm_processor_mask_t *all_involved_processors =
10331         &service_context->block_context.make_resident.all_involved_processors;
10332     uvm_page_mask_t *new_residency_mask =
10333         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10334     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10335     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10336     uvm_make_resident_cause_t cause;
10337     NV_STATUS status;
10338 
10339     // 1- Migrate pages
10340     switch (service_context->operation) {
10341         case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS:
10342             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
10343             break;
10344         case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS:
10345             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
10346             break;
10347         case UVM_SERVICE_OPERATION_ACCESS_COUNTERS:
10348             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10349             break;
10350         default:
10351             UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation);
10352             // Set cause to silence compiler warning that it may be unused.
10353             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10354             break;
10355     }
10356 
10357     // Reset masks before all of the make_resident calls
10358     uvm_page_mask_zero(did_migrate_mask);
10359     uvm_processor_mask_zero(all_involved_processors);
10360 
10361     // Handle read duplication first so that the caller_page_mask will be free
10362     // to use below and still valid in uvm_va_block_service_finish().
10363     // TODO: Bug 3660922: need to implement HMM read duplication support.
10364     if (service_context->read_duplicate_count != 0 &&
10365         uvm_page_mask_and(caller_page_mask,
10366                           new_residency_mask,
10367                           &service_context->read_duplicate_mask)) {
10368         status = uvm_va_block_make_resident_read_duplicate(va_block,
10369                                                            block_retry,
10370                                                            &service_context->block_context,
10371                                                            new_residency,
10372                                                            service_context->region,
10373                                                            caller_page_mask,
10374                                                            &service_context->prefetch_hint.prefetch_pages_mask,
10375                                                            cause);
10376         if (status != NV_OK)
10377             return status;
10378     }
10379 
10380     if (service_context->read_duplicate_count == 0 ||
10381         uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) {
10382         if (service_context->read_duplicate_count == 0)
10383             uvm_page_mask_copy(caller_page_mask, new_residency_mask);
10384         status = uvm_va_block_make_resident_copy(va_block,
10385                                                  block_retry,
10386                                                  &service_context->block_context,
10387                                                  new_residency,
10388                                                  service_context->region,
10389                                                  caller_page_mask,
10390                                                  &service_context->prefetch_hint.prefetch_pages_mask,
10391                                                  cause);
10392         if (status != NV_OK)
10393             return status;
10394     }
10395 
10396     if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors))
10397         service_context->cpu_fault.did_migrate = true;
10398 
10399     // 2- Check for ECC errors on all GPUs involved in the migration if CPU is
10400     //    the destination. Migrations in response to CPU faults are special
10401     //    because they're on the only path (apart from tools) where CUDA is not
10402     //    involved and wouldn't have a chance to do its own ECC checking.
10403     if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS &&
10404         UVM_ID_IS_CPU(new_residency) &&
10405         !uvm_processor_mask_empty(all_involved_processors)) {
10406         uvm_gpu_t *gpu;
10407 
10408         // Before checking for ECC errors, make sure all of the GPU work
10409         // is finished. Creating mappings on the CPU would have to wait
10410         // for the tracker anyway so this shouldn't hurt performance.
10411         status = uvm_tracker_wait(&va_block->tracker);
10412         if (status != NV_OK)
10413             return status;
10414 
10415         for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) {
            // We cannot call into RM here, so use the no-RM ECC check.
10417             status = uvm_gpu_check_ecc_error_no_rm(gpu);
10418             if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
10419                 // In case we need to call into RM to be sure whether
10420                 // there is an ECC error or not, signal that to the
10421                 // caller by adding the GPU to the mask.
10422                 //
10423                 // In that case the ECC error might be noticed only after
10424                 // the CPU mappings have been already created below,
10425                 // exposing different CPU threads to the possibly corrupt
10426                 // data, but this thread will fault eventually and that's
10427                 // considered to be an acceptable trade-off between
10428                 // performance and ECC error containment.
10429                 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id);
10430                 status = NV_OK;
10431             }
10432             if (status != NV_OK)
10433                 return status;
10434         }
10435     }
10436 
10437     return NV_OK;
10438 }
10439 
10440 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
10441                                       uvm_va_block_t *va_block,
10442                                       uvm_service_block_context_t *service_context)
10443 {
10444     uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id;
10445     uvm_page_mask_t *new_residency_mask =
10446         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10447     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10448     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10449     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10450     uvm_prot_t new_prot;
10451     uvm_page_index_t page_index;
10452     NV_STATUS status;
10453 
10454     // Update residency.
10455     if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
10456         uvm_va_block_make_resident_finish(va_block,
10457                                           &service_context->block_context,
10458                                           service_context->region,
10459                                           caller_page_mask);
10460 
10461     uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask);
10462 
10463     // The loops below depend on the enums having the following values in order
10464     // to index into service_context->mappings_by_prot[].
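    // For example, UVM_PROT_READ_WRITE (2) indexes mappings_by_prot[1], and
    // UVM_PROT_READ_WRITE_ATOMIC (3) indexes mappings_by_prot[2].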
10465     BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1);
10466     BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2);
10467     BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3);
10468     BUILD_BUG_ON(UVM_PROT_MAX != 4);
10469 
10470     // 1- Compute mapping protections for the requesting processor on the new
10471     // residency.
10472     for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot)
10473         service_context->mappings_by_prot[new_prot - 1].count = 0;
10474 
10475     for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
10476         new_prot = compute_new_permission(va_block,
10477                                           &service_context->block_context,
10478                                           page_index,
10479                                           processor_id,
10480                                           new_residency,
10481                                           service_context->access_type[page_index]);
10482 
10483         if (service_context->mappings_by_prot[new_prot - 1].count++ == 0)
10484             uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask);
10485 
10486         uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index);
10487     }
10488 
10489     // 2- Revoke permissions
10490     //
10491     // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations.
10492     //       Thus, we need to revoke only if residency did not change and we
10493     //       are mapping higher than READ ONLY.
10494     for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10495         bool pages_need_revocation;
10496         uvm_processor_mask_t revoke_processors;
10497         uvm_prot_t revoke_prot;
10498         bool this_processor_has_enabled_atomics;
10499 
10500         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10501             continue;
10502 
10503         pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask,
10504                                                   &service_context->did_not_migrate_mask,
10505                                                   &service_context->mappings_by_prot[new_prot - 1].page_mask);
10506         if (!pages_need_revocation)
10507             continue;
10508 
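        // Start from all faultable processors that currently map this block;
        // the operations below narrow this set down to those that actually
        // need a downgrade.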
10509         uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors);
10510 
10511         // Do not revoke the processor that took the fault
10512         uvm_processor_mask_clear(&revoke_processors, processor_id);
10513 
10514         this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors,
10515                                                                      processor_id);
10516 
10517         // Atomic operations on processors with system-wide atomics
10518         // disabled or with native atomics access to new_residency
10519         // behave like writes.
10520         if (new_prot == UVM_PROT_READ_WRITE ||
10521             !this_processor_has_enabled_atomics ||
10522             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) {
10523 
10524             // Exclude processors with native atomics on the resident copy
10525             uvm_processor_mask_andnot(&revoke_processors,
10526                                       &revoke_processors,
10527                                       &va_space->has_native_atomics[uvm_id_value(new_residency)]);
10528 
10529             // Exclude processors with disabled system-wide atomics
10530             uvm_processor_mask_and(&revoke_processors,
10531                                    &revoke_processors,
10532                                    &va_space->system_wide_atomics_enabled_processors);
10533         }
10534 
10535         if (UVM_ID_IS_CPU(processor_id)) {
10536             revoke_prot = UVM_PROT_READ_WRITE_ATOMIC;
10537         }
10538         else {
10539             revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE:
10540                                                                     UVM_PROT_READ_WRITE_ATOMIC;
10541         }
10542 
10543         // UVM-Lite processors must always have RWA mappings
10544         if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) {
            // Access counters should never trigger revocations apart from
            // those for read-duplication, which are performed in the calls to
            // uvm_va_block_make_resident_read_duplicate, above.
10548             if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10549                 UVM_ASSERT(check_access_counters_dont_revoke(va_block,
10550                                                              &service_context->block_context,
10551                                                              service_context->region,
10552                                                              &revoke_processors,
10553                                                              &service_context->revocation_mask,
10554                                                              revoke_prot));
10555             }
10556 
10557             // Downgrade other processors' mappings
10558             status = uvm_va_block_revoke_prot_mask(va_block,
10559                                                    &service_context->block_context,
10560                                                    &revoke_processors,
10561                                                    service_context->region,
10562                                                    &service_context->revocation_mask,
10563                                                    revoke_prot);
10564             if (status != NV_OK)
10565                 return status;
10566         }
10567     }
10568 
10569     // 3- Map requesting processor with the necessary privileges
10570     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10571         const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask;
10572 
10573         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10574             continue;
10575 
10576         // 3.1 - Unmap CPU pages
        // HMM CPU mappings can be upgraded at any time without notification,
        // so there is no need to downgrade first.
10579         if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
10580             UVM_ID_IS_CPU(processor_id) &&
10581             !uvm_va_block_is_hmm(va_block)) {
            // The kernel can downgrade managed CPU mappings at any time without
            // notifying us, which means our PTE state could be stale. We handle
            // this by unmapping the CPU PTE and remapping it.
10585             //
10586             // A CPU fault is unexpected if:
10587             // curr_prot == RW || (!is_write && curr_prot == RO)
10588             status = uvm_va_block_unmap(va_block,
10589                                         &service_context->block_context,
10590                                         UVM_ID_CPU,
10591                                         service_context->region,
10592                                         map_prot_mask,
10593                                         NULL);
10594             if (status != NV_OK)
10595                 return status;
10596         }
10597 
10598         // 3.2 - Add new mappings
10599 
10600         // The faulting processor can be mapped remotely due to user policy or
10601         // the thrashing mitigation heuristics. Therefore, we set the cause
10602         // accordingly in each case.
10603 
10604         // Map pages that are thrashing first
10605         if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
10606             uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask;
10607             bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
10608                                                         map_prot_mask,
10609                                                         &service_context->thrashing_pin_mask);
10610             if (pages_need_mapping) {
10611                 status = uvm_va_block_map(va_block,
10612                                           &service_context->block_context,
10613                                           processor_id,
10614                                           service_context->region,
10615                                           helper_page_mask,
10616                                           new_prot,
10617                                           UvmEventMapRemoteCauseThrashing,
10618                                           &va_block->tracker);
10619                 if (status != NV_OK)
10620                     return status;
10621 
10622                 // Remove thrashing pages from the map mask
10623                 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask,
10624                                                           map_prot_mask,
10625                                                           &service_context->thrashing_pin_mask);
10626                 if (!pages_need_mapping)
10627                     continue;
10628 
10629                 map_prot_mask = helper_page_mask;
10630             }
10631         }
10632 
10633         status = uvm_va_block_map(va_block,
10634                                   &service_context->block_context,
10635                                   processor_id,
10636                                   service_context->region,
10637                                   map_prot_mask,
10638                                   new_prot,
10639                                   UvmEventMapRemoteCausePolicy,
10640                                   &va_block->tracker);
10641         if (status != NV_OK)
10642             return status;
10643     }
10644 
10645     // 4- If pages did migrate, map SetAccessedBy processors, except for
10646     // UVM-Lite
10647     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10648         bool pages_need_mapping;
10649 
10650         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10651             continue;
10652 
10653         pages_need_mapping = uvm_page_mask_and(caller_page_mask,
10654                                                new_residency_mask,
10655                                                &service_context->mappings_by_prot[new_prot - 1].page_mask);
10656         if (!pages_need_mapping)
10657             continue;
10658 
10659         // Map pages that are thrashing
10660         if (service_context->thrashing_pin_count > 0) {
10661             uvm_page_index_t page_index;
10662 
10663             for_each_va_block_page_in_region_mask(page_index,
10664                                                   &service_context->thrashing_pin_mask,
10665                                                   service_context->region) {
10666                 uvm_processor_mask_t *map_thrashing_processors = NULL;
10667                 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index);
10668 
                // Skip pages that do not belong to the current protection level
10670                 if (!uvm_page_mask_test(caller_page_mask, page_index))
10671                     continue;
10672 
10673                 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
10674 
10675                 status = uvm_va_block_add_mappings_after_migration(va_block,
10676                                                                    &service_context->block_context,
10677                                                                    new_residency,
10678                                                                    processor_id,
10679                                                                    uvm_va_block_region_for_page(page_index),
10680                                                                    caller_page_mask,
10681                                                                    new_prot,
10682                                                                    map_thrashing_processors);
10683                 if (status != NV_OK)
10684                     return status;
10685             }
10686 
10687             pages_need_mapping = uvm_page_mask_andnot(caller_page_mask,
10688                                                       caller_page_mask,
10689                                                       &service_context->thrashing_pin_mask);
10690             if (!pages_need_mapping)
10691                 continue;
10692         }
10693 
10694         // Map the rest of pages in a single shot
10695         status = uvm_va_block_add_mappings_after_migration(va_block,
10696                                                            &service_context->block_context,
10697                                                            new_residency,
10698                                                            processor_id,
10699                                                            service_context->region,
10700                                                            caller_page_mask,
10701                                                            new_prot,
10702                                                            NULL);
10703         if (status != NV_OK)
10704             return status;
10705     }
10706 
10707     return NV_OK;
10708 }
10709 
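// Service a fault or access counter notification on this block: compute the
// prefetch hint, then perform the copy and finish phases (or the HMM-specific
// path) for each destination processor selected in
// service_context->resident_processors.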
10710 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
10711                                       uvm_va_block_t *va_block,
10712                                       uvm_va_block_retry_t *block_retry,
10713                                       uvm_service_block_context_t *service_context)
10714 {
10715     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10716     uvm_processor_id_t new_residency;
10717     NV_STATUS status = NV_OK;
10718 
10719     uvm_assert_mutex_locked(&va_block->lock);
10720     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10721                                                   service_context->block_context.policy,
10722                                                   service_context->region));
10723     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10724                                                   &service_context->block_context,
10725                                                   service_context->region));
10726 
10727     // GPU fault servicing must be done under the VA space read lock. GPU fault
10728     // servicing is required for RM to make forward progress, and we allow other
10729     // threads to call into RM while holding the VA space lock in read mode. If
10730     // we took the VA space lock in write mode on the GPU fault service path,
10731     // we could deadlock because the thread in RM which holds the VA space lock
10732     // for read wouldn't be able to complete until fault servicing completes.
10733     if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id))
10734         uvm_assert_rwsem_locked(&va_space->lock);
10735     else
10736         uvm_assert_rwsem_locked_read(&va_space->lock);
10737 
10738     uvm_va_block_get_prefetch_hint(va_block, service_context);
10739 
10740     for_each_id_in_mask(new_residency, &service_context->resident_processors) {
10741         if (uvm_va_block_is_hmm(va_block)) {
10742             status = uvm_hmm_va_block_service_locked(processor_id, new_residency, va_block, block_retry, service_context);
10743             if (status != NV_OK)
10744                 break;
10745 
10746             continue;
10747         }
10748 
10749         status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context);
10750         if (status != NV_OK)
10751             break;
10752 
10753         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
10754         if (status != NV_OK)
10755             break;
10756     }
10757 
10758     return status;
10759 }
10760 
10761 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
10762                                                  uvm_va_block_context_t *va_block_context,
10763                                                  uvm_processor_id_t processor_id,
10764                                                  uvm_page_index_t page_index,
10765                                                  uvm_fault_type_t access_type,
10766                                                  bool allow_migration)
10767 {
10768     uvm_va_range_t *va_range = va_block->va_range;
10769     uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type);
10770 
10771     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10772                                                   va_block_context->policy,
10773                                                   uvm_va_block_region_for_page(page_index)));
10774     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10775                                                   va_block_context,
10776                                                   uvm_va_block_region_for_page(page_index)));
10777 
10778     // CPU permissions are checked later by block_map_cpu_page.
10779     //
10780     // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because
10781     //       it can also be called from change_pte. Make change_pte call this
10782     //       function and only check CPU permissions here.
10783     if (UVM_ID_IS_GPU(processor_id)) {
10784         if (va_range && uvm_va_range_is_managed_zombie(va_range))
10785             return NV_ERR_INVALID_ADDRESS;
10786 
10787         // GPU faults only check vma permissions if a mm is registered with the
        // VA space (i.e., uvm_va_space_mm_retain_lock(va_space) != NULL) or if
10789         // uvm_enable_builtin_tests is set, because the Linux kernel can change
10790         // vm_flags at any moment (for example on mprotect) and here we are not
10791         // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that
10792         // this scenario does not happen.
10793         if ((va_block_context->mm || uvm_enable_builtin_tests) &&
10794             (access_prot > compute_logical_prot(va_block, va_block_context, page_index)))
10795             return NV_ERR_INVALID_ACCESS_TYPE;
10796     }
10797 
10798     // Non-migratable range:
10799     // - CPU accesses are always fatal, regardless of the VA range residency
10800     // - GPU accesses are fatal if the GPU can't map the preferred location
10801     if (!allow_migration) {
10802         UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
10803 
10804         if (UVM_ID_IS_CPU(processor_id)) {
10805             return NV_ERR_INVALID_OPERATION;
10806         }
10807         else {
10808             uvm_va_space_t *va_space = va_range->va_space;
10809 
10810             return uvm_processor_mask_test(
10811                     &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
10812                     processor_id)?
10813                 NV_OK : NV_ERR_INVALID_ACCESS_TYPE;
10814         }
10815     }
10816 
10817     return NV_OK;
10818 }
10819 
// Check if we are faulting on a page that already has valid permissions, in
// which case fault handling can be skipped. See
// uvm_va_block_t::cpu::fault_authorized for more details.
10823 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
10824                                                   uvm_page_index_t page_index,
10825                                                   uvm_fault_access_type_t fault_access_type)
10826 {
10827     // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for
10828     // HMM?
10829     if (uvm_va_block_is_hmm(va_block))
10830         return false;
10831 
10832     if (block_page_is_processor_authorized(va_block,
10833                                            page_index,
10834                                            UVM_ID_CPU,
10835                                            uvm_fault_access_type_to_prot(fault_access_type))) {
10836         NvU64 now = NV_GETTIME();
10837         pid_t pid = current->pid;
10838 
10839         // Latch the pid/timestamp/page_index values for the first time
10840         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
10841             va_block->cpu.fault_authorized.first_fault_stamp = now;
10842             va_block->cpu.fault_authorized.first_pid = pid;
10843             va_block->cpu.fault_authorized.page_index = page_index;
10844 
10845             return true;
10846         }
10847 
10848         // If the same thread shows up again, this means that the kernel
10849         // downgraded the page's PTEs. Service the fault to force a remap of
10850         // the page.
10851         if (va_block->cpu.fault_authorized.first_pid == pid &&
10852             va_block->cpu.fault_authorized.page_index == page_index) {
10853             va_block->cpu.fault_authorized.first_fault_stamp = 0;
10854         }
10855         else {
            // If the window has expired, clear the information and service the
            // fault. Otherwise, just return.
10858             if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns)
10859                 va_block->cpu.fault_authorized.first_fault_stamp = 0;
10860             else
10861                 return true;
10862         }
10863     }
10864 
10865     return false;
10866 }
10867 
10868 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
10869                                         uvm_va_block_retry_t *va_block_retry,
10870                                         NvU64 fault_addr,
10871                                         uvm_fault_access_type_t fault_access_type,
10872                                         uvm_service_block_context_t *service_context)
10873 {
10874     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10875     NV_STATUS status = NV_OK;
10876     uvm_page_index_t page_index;
10877     uvm_perf_thrashing_hint_t thrashing_hint;
10878     uvm_processor_id_t new_residency;
10879     bool read_duplicate;
10880 
10881     uvm_assert_rwsem_locked(&va_space->lock);
10882 
10883     UVM_ASSERT(fault_addr >= va_block->start);
10884     UVM_ASSERT(fault_addr <= va_block->end);
10885 
10886     uvm_assert_mmap_lock_locked(service_context->block_context.mm);
10887 
10888     service_context->block_context.policy = uvm_va_policy_get(va_block, fault_addr);
10889 
10890     if (service_context->num_retries == 0) {
        // Notify tools and performance heuristics of the fault event
10892         uvm_perf_event_notify_cpu_fault(&va_space->perf_events,
10893                                         va_block,
10894                                         service_context->block_context.policy->preferred_location,
10895                                         fault_addr,
10896                                         fault_access_type > UVM_FAULT_ACCESS_TYPE_READ,
10897                                         KSTK_EIP(current));
10898     }
10899 
10900     // Check logical permissions
10901     page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
10902     status = uvm_va_block_check_logical_permissions(va_block,
10903                                                     &service_context->block_context,
10904                                                     UVM_ID_CPU,
10905                                                     page_index,
10906                                                     fault_access_type,
10907                                                     uvm_range_group_address_migratable(va_space, fault_addr));
10908     if (status != NV_OK)
10909         return status;
10910 
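    // Reset the ECC-check mask; uvm_va_block_service_copy() may add GPUs to it
    // while servicing this fault.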
10911     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
10912 
10913     if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
10914         return NV_OK;
10915 
10916     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU);
10917     // Throttling is implemented by sleeping in the fault handler on the CPU
10918     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
10919         service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp;
10920         return NV_WARN_MORE_PROCESSING_REQUIRED;
10921     }
10922 
10923     service_context->read_duplicate_count = 0;
10924     service_context->thrashing_pin_count = 0;
10925     service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
10926 
10927     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10928         uvm_page_mask_zero(&service_context->thrashing_pin_mask);
10929         uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
10930         service_context->thrashing_pin_count = 1;
10931     }
10932 
10933     // Compute new residency and update the masks
10934     new_residency = uvm_va_block_select_residency(va_block,
10935                                                   &service_context->block_context,
10936                                                   page_index,
10937                                                   UVM_ID_CPU,
10938                                                   uvm_fault_access_type_mask_bit(fault_access_type),
10939                                                   service_context->block_context.policy,
10940                                                   &thrashing_hint,
10941                                                   UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
10942                                                   &read_duplicate);
10943 
10944     // Initialize the minimum necessary state in the fault service context
10945     uvm_processor_mask_zero(&service_context->resident_processors);
10946 
10947     // Set new residency and update the masks
10948     uvm_processor_mask_set(&service_context->resident_processors, new_residency);
10949 
10950     // The masks need to be fully zeroed as the fault region may grow due to prefetching
10951     uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
10952     uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
10953 
10954     if (read_duplicate) {
10955         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10956         uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10957         service_context->read_duplicate_count = 1;
10958     }
10959 
10960     service_context->access_type[page_index] = fault_access_type;
10961 
10962     service_context->region = uvm_va_block_region_for_page(page_index);
10963 
10964     status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context);
10965 
10966     ++service_context->num_retries;
10967 
10968     return status;
10969 }
10970 
10971 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
10972                                  NvU64 fault_addr,
10973                                  bool is_write,
10974                                  uvm_service_block_context_t *service_context)
10975 {
10976     NV_STATUS status;
10977     uvm_va_block_retry_t va_block_retry;
10978     uvm_fault_access_type_t fault_access_type;
10979 
10980     if (is_write)
10981         fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
10982     else
10983         fault_access_type = UVM_FAULT_ACCESS_TYPE_READ;
10984 
10985     service_context->num_retries = 0;
10986     service_context->cpu_fault.did_migrate = false;
10987 
10988     // We have to use vm_insert_page instead of handing the page to the kernel
10989     // and letting it insert the mapping, and we must do that while holding the
10990     // lock on this VA block. Otherwise there will be a window in which we think
10991     // we've mapped the page but the CPU mapping hasn't actually been created
10992     // yet. During that window a GPU fault event could arrive and claim
10993     // ownership of that VA, "unmapping" it. Then later the kernel would
10994     // eventually establish the mapping, and we'd end up with both CPU and GPU
10995     // thinking they each owned the page.
10996     //
10997     // This function must only be called when it's safe to call vm_insert_page.
10998     // That is, there must be a reference held on the vma's vm_mm, and
10999     // vm_mm->mmap_lock is held in at least read mode. Note that current->mm
11000     // might not be vma->vm_mm.
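    //
    // Note that UVM_VA_BLOCK_LOCK_RETRY() may invoke block_cpu_fault_locked()
    // more than once; service_context->num_retries counts the invocations so
    // that per-fault work, such as the tools event notification, is only done
    // on the first attempt.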
11001     status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
11002                                      &va_block_retry,
11003                                      block_cpu_fault_locked(va_block,
11004                                                             &va_block_retry,
11005                                                             fault_addr,
11006                                                             fault_access_type,
11007                                                             service_context));
11008     return status;
11009 }
11010 
11011 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block)
11012 {
11013     uvm_va_range_t *va_range;
11014     uvm_va_block_t *block;
11015     size_t index;
11016 
11017     va_range = uvm_va_range_find(va_space, addr);
11018     if (!va_range)
11019         return uvm_hmm_va_block_find(va_space, addr, out_block);
11020 
11021     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
11022                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
11023 
11024     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
11025         return NV_ERR_INVALID_ADDRESS;
11026 
11027     index = uvm_va_range_block_index(va_range, addr);
11028     block = uvm_va_range_block(va_range, index);
11029     if (!block)
11030         return NV_ERR_OBJECT_NOT_FOUND;
11031 
11032     *out_block = block;
11033     return NV_OK;
11034 }
11035 
11036 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
11037                                             uvm_va_range_t *va_range,
11038                                             NvU64 addr,
11039                                             uvm_va_block_context_t *va_block_context,
11040                                             uvm_va_block_t **out_block)
11041 {
11042     size_t index;
11043 
11044     if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0)
11045         return NV_ERR_NO_MEMORY;
11046 
11047     if (!va_range) {
11048         if (!va_block_context || !va_block_context->mm)
11049             return NV_ERR_INVALID_ADDRESS;
11050         return uvm_hmm_va_block_find_create(va_space, addr, va_block_context, out_block);
11051     }
11052 
11053     UVM_ASSERT(addr >= va_range->node.start);
11054     UVM_ASSERT(addr <= va_range->node.end);
11055 
11056     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
11057                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
11058 
11059     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
11060         return NV_ERR_INVALID_ADDRESS;
11061 
11062     index = uvm_va_range_block_index(va_range, addr);
11063     return uvm_va_range_block_create(va_range, index, out_block);
11064 }
11065 
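// Like uvm_va_block_find_create_in_range(), but looks up the VA range covering
// addr first.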
11066 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
11067                                    NvU64 addr,
11068                                    uvm_va_block_context_t *va_block_context,
11069                                    uvm_va_block_t **out_block)
11070 {
11071     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
11072 
11073     return uvm_va_block_find_create_in_range(va_space, va_range, addr, va_block_context, out_block);
11074 }
11075 
11076 // Launch a synchronous, encrypted copy between GPU and CPU.
11077 //
11078 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a
// CPU-side decryption step, such that the destination CPU buffer pointed to by
// dst_plain will contain the unencrypted (plaintext) contents. The destination
11081 // buffer can be in protected or unprotected sysmem, while the source buffer
11082 // must be in protected vidmem.
11083 //
11084 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11085 //
11086 // The input tracker, if not NULL, is internally acquired by the push
11087 // responsible for the encrypted copy.
11088 __attribute__ ((format(printf, 6, 7)))
11089 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
11090                                               void *dst_plain,
11091                                               uvm_gpu_address_t src_gpu_address,
11092                                               size_t size,
11093                                               uvm_tracker_t *tracker,
11094                                               const char *format,
11095                                               ...)
11096 {
11097     NV_STATUS status;
11098     UvmCslIv decrypt_iv;
11099     uvm_push_t push;
11100     uvm_conf_computing_dma_buffer_t *dma_buffer;
11101     uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
11102     void *src_cipher, *auth_tag;
11103     va_list args;
11104 
11105     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11106     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11107 
11108     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11109     if (status != NV_OK)
11110         return status;
11111 
11112     va_start(args, format);
11113     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
11114     va_end(args);
11115 
11116     if (status != NV_OK)
11117         goto out;
11118 
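    // Obtain the IV needed to decrypt, on the CPU, the data that the GPU
    // encrypts below.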
11119     uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
11120 
11121     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11122     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11123     gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11124 
11125     status = uvm_push_end_and_wait(&push);
11126     if (status != NV_OK)
11127         goto out;
11128 
11129     src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11130     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11131     status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
11132 
11133  out:
11134     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11135     return status;
11136 }
11137 
11138 // Launch a synchronous, encrypted copy between CPU and GPU.
11139 //
// The source CPU buffer pointed to by src_plain contains the unencrypted
// (plaintext) contents; the function internally performs a CPU-side encryption
// step before launching the GPU-side CE decryption. The source buffer can be in
11143 // protected or unprotected sysmem, while the destination buffer must be in
11144 // protected vidmem.
11145 //
11146 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11147 //
11148 // The input tracker, if not NULL, is internally acquired by the push
11149 // responsible for the encrypted copy.
11150 __attribute__ ((format(printf, 6, 7)))
11151 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
11152                                               uvm_gpu_address_t dst_gpu_address,
11153                                               void *src_plain,
11154                                               size_t size,
11155                                               uvm_tracker_t *tracker,
11156                                               const char *format,
11157                                               ...)
11158 {
11159     NV_STATUS status;
11160     uvm_push_t push;
11161     uvm_conf_computing_dma_buffer_t *dma_buffer;
11162     uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
11163     void *dst_cipher, *auth_tag;
11164     va_list args;
11165 
11166     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11167     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11168 
11169     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11170     if (status != NV_OK)
11171         return status;
11172 
11173     va_start(args, format);
11174     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
11175     va_end(args);
11176 
11177     if (status != NV_OK)
11178         goto out;
11179 
11180     dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11181     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11182     uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
11183 
11184     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11185     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11186     gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11187 
11188     status = uvm_push_end_and_wait(&push);
11189 
11190 out:
11191     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11192     return status;
11193 }
11194 
11195 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
11196                                            uvm_gpu_t *gpu,
11197                                            uvm_gpu_address_t dst_gpu_address,
11198                                            NvU64 dst,
11199                                            uvm_mem_t *src_mem,
11200                                            size_t size)
11201 {
11202     NV_STATUS status;
11203     uvm_push_t push;
11204     uvm_gpu_address_t src_gpu_address;
11205 
11206     if (uvm_conf_computing_mode_enabled(gpu)) {
11207         return encrypted_memcopy_cpu_to_gpu(gpu,
11208                                             dst_gpu_address,
11209                                             uvm_mem_get_cpu_addr_kernel(src_mem),
11210                                             size,
11211                                             &va_block->tracker,
11212                                             "Encrypted write to [0x%llx, 0x%llx)",
11213                                             dst,
11214                                             dst + size);
11215     }
11216 
11217     status = uvm_push_begin_acquire(gpu->channel_manager,
11218                                     UVM_CHANNEL_TYPE_CPU_TO_GPU,
11219                                     &va_block->tracker,
11220                                     &push,
11221                                     "Direct write to [0x%llx, 0x%llx)",
11222                                     dst,
11223                                     dst + size);
11224     if (status != NV_OK)
11225         return status;
11226 
11227     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
11228     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11229     return uvm_push_end_and_wait(&push);
11230 }
11231 
11232 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
11233                                       uvm_va_block_context_t *block_context,
11234                                       NvU64 dst,
11235                                       uvm_mem_t *src_mem,
11236                                       size_t size)
11237 {
11238     NV_STATUS status;
11239     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst);
11240     NvU64 page_offset = dst & (PAGE_SIZE - 1);
11241     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11242     uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index);
11243 
11244     uvm_assert_mutex_locked(&va_block->lock);
11245     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size);
11246 
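    // If the page is not resident anywhere yet, stage the write through the
    // CPU.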
11247     if (UVM_ID_IS_INVALID(proc))
11248         proc = UVM_ID_CPU;
11249 
11250     block_context->policy = uvm_va_policy_get(va_block, dst);
11251 
    // Use make_resident() in all cases to break read-duplication. block_retry
    // can be NULL because, if the page is not resident yet, we will make it
    // resident on the CPU.
    //
    // Notably, we don't care about coherence with respect to atomics from
    // other processors.
11257     status = uvm_va_block_make_resident(va_block,
11258                                         NULL,
11259                                         block_context,
11260                                         proc,
11261                                         region,
11262                                         NULL,
11263                                         NULL,
11264                                         UVM_MAKE_RESIDENT_CAUSE_API_TOOLS);
11265 
11266     if (status != NV_OK)
11267         return status;
11268 
11269     if (UVM_ID_IS_CPU(proc)) {
11270         char *mapped_page;
11271         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11272         void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
11273 
11274         status = uvm_tracker_wait(&va_block->tracker);
11275         if (status != NV_OK)
11276             return status;
11277 
11278         mapped_page = (char *)kmap(page);
11279         memcpy(mapped_page + page_offset, src, size);
11280         kunmap(page);
11281 
11282         return NV_OK;
11283     }
11284     else {
11285         uvm_gpu_t *dst_gpu;
11286         uvm_gpu_address_t dst_gpu_address;
11287 
11288         UVM_ASSERT(UVM_ID_IS_GPU(proc));
11289 
11290         dst_gpu = block_get_gpu(va_block, proc);
11291 
11292         dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu);
11293         dst_gpu_address.address += page_offset;
11294 
11295         return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
11296     }
11297 }
11298 
11299 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
11300                                           uvm_mem_t *dst_mem,
11301                                           uvm_gpu_t *gpu,
11302                                           uvm_gpu_address_t src_gpu_address,
11303                                           NvU64 src,
11304                                           size_t size)
11305 {
11306     NV_STATUS status;
11307     uvm_push_t push;
11308     uvm_gpu_address_t dst_gpu_address;
11309 
11310     if (uvm_conf_computing_mode_enabled(gpu)) {
11311         return encrypted_memcopy_gpu_to_cpu(gpu,
11312                                             uvm_mem_get_cpu_addr_kernel(dst_mem),
11313                                             src_gpu_address,
11314                                             size,
11315                                             &va_block->tracker,
11316                                             "Encrypted read from [0x%llx, 0x%llx)",
11317                                             src,
11318                                             src + size);
11319     }
11320 
11321     status = uvm_push_begin_acquire(gpu->channel_manager,
11322                                     UVM_CHANNEL_TYPE_GPU_TO_CPU,
11323                                     &va_block->tracker,
11324                                     &push,
11325                                     "Direct read from [0x%llx, 0x%llx)",
11326                                     src,
11327                                     src + size);
11328     if (status != NV_OK)
11329         return status;
11330 
11331     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
11332     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11333     return uvm_push_end_and_wait(&push);
11334 }
11335 
11336 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size)
11337 {
11338     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src);
11339     NvU64 page_offset = src & (PAGE_SIZE - 1);
11340     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11341     void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem);
11342 
11343     uvm_assert_mutex_locked(&va_block->lock);
11344     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size);
11345 
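    // Pages that were never populated read back as zeros.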
11346     if (UVM_ID_IS_INVALID(proc)) {
11347         memset(dst, 0, size);
11348         return NV_OK;
11349     }
11350     else if (UVM_ID_IS_CPU(proc)) {
11351         NV_STATUS status;
11352         char *mapped_page;
11353         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11354 
11355         status = uvm_tracker_wait(&va_block->tracker);
11356         if (status != NV_OK)
11357             return status;
11358 
11359         mapped_page = (char *)kmap(page);
11360         memcpy(dst, mapped_page + page_offset, size);
11361         kunmap(page);
11362 
11363         return NV_OK;
11364     }
11365     else {
11366         uvm_gpu_address_t src_gpu_address;
11367         uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
11368 
11369         src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu);
11370         src_gpu_address.address += page_offset;
11371 
11372         return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
11373     }
11374 }
11375 
// Deferred work item that re-establishes accessed-by mappings after eviction.
// On GPUs with access counters enabled, the evicted GPU will also get remote
11378 // mappings.
11379 static void block_add_eviction_mappings(void *args)
11380 {
11381     uvm_va_block_t *va_block = (uvm_va_block_t*)args;
11382     uvm_va_space_t *va_space;
11383     uvm_processor_id_t id;
11384     uvm_va_block_context_t *block_context = NULL;
11385     struct mm_struct *mm = NULL;
11386 
11387     uvm_mutex_lock(&va_block->lock);
11388     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11389     uvm_mutex_unlock(&va_block->lock);
11390 
11391     if (!va_space) {
11392         // Block has been killed in the meantime
11393         goto done;
11394     }
11395 
11396     mm = uvm_va_space_mm_retain_lock(va_space);
11397 
11398     block_context = uvm_va_block_context_alloc(mm);
11399     if (!block_context)
11400         goto done;
11401 
11402     // The block wasn't dead when we checked above and that's enough to
11403     // guarantee that the VA space is still around, because
11404     // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that
11405     // flush waits for this function call to finish.
11406     uvm_va_space_down_read(va_space);
11407 
11408     // Now that we have the VA space lock held, we can check whether the block
11409     // is still alive since the VA space write lock is needed to kill blocks.
11410     if (uvm_va_block_is_dead(va_block))
11411         goto unlock;
11412 
11413     if (uvm_va_block_is_hmm(va_block)) {
11414         uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context);
11415     }
11416     else {
11417         uvm_va_range_t *va_range = va_block->va_range;
11418         NV_STATUS status = NV_OK;
11419 
11420         block_context->policy = uvm_va_range_get_policy(va_range);
11421         for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) {
11422             status = uvm_va_block_set_accessed_by(va_block, block_context, id);
11423             if (status != NV_OK)
11424                 break;
11425         }
11426 
11427         if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) {
11428             uvm_processor_mask_t map_processors;
11429 
11430             // Exclude the processors that have been already mapped due to
11431             // AccessedBy
11432             uvm_processor_mask_andnot(&map_processors,
11433                                       &va_block->evicted_gpus,
11434                                       &uvm_va_range_get_policy(va_range)->accessed_by);
11435 
11436             for_each_gpu_id_in_mask(id, &map_processors) {
11437                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
11438                 uvm_va_block_gpu_state_t *gpu_state;
11439 
11440                 if (!gpu->parent->access_counters_supported)
11441                     continue;
11442 
11443                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
11444                 UVM_ASSERT(gpu_state);
11445 
11446                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
11447                 // remote mappings to read-duplicated pages. Add support for it
11448                 // or create a new function.
11449                 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
11450                                                  uvm_va_block_add_mappings(va_block,
11451                                                                            block_context,
11452                                                                            id,
11453                                                                            uvm_va_block_region_from_block(va_block),
11454                                                                            &gpu_state->evicted,
11455                                                                            UvmEventMapRemoteCauseEviction));
11456                 if (status != NV_OK)
11457                     break;
11458             }
11459         }
11460 
11461         if (status != NV_OK) {
11462             UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n",
11463                           va_block->start,
11464                           va_block->end,
11465                           nvstatusToString(status),
11466                           uvm_va_space_processor_name(va_space, id));
11467         }
11468     }
11469 
11470 unlock:
11471     uvm_va_space_up_read(va_space);
11472     uvm_va_block_context_free(block_context);
11473 
11474 done:
11475     uvm_va_space_mm_release_unlock(va_space, mm);
11476     uvm_va_block_release(va_block);
11477 }
11478 
11479 static void block_add_eviction_mappings_entry(void *args)
11480 {
11481     UVM_ENTRY_VOID(block_add_eviction_mappings(args));
11482 }
11483 
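// Evict the pages of va_block that are backed by subchunks of root_chunk on
// the given GPU by migrating them to the CPU. Mappings cannot be re-established
// here, so deferred work is scheduled when needed. Called with the block lock
// held on the eviction path.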
11484 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
11485                                     uvm_gpu_t *gpu,
11486                                     uvm_gpu_chunk_t *root_chunk,
11487                                     uvm_tracker_t *tracker)
11488 {
11489     NV_STATUS status = NV_OK;
11490     NvU32 i;
11491     uvm_va_block_gpu_state_t *gpu_state;
11492     uvm_va_block_region_t chunk_region;
11493     size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu);
11494     size_t chunks_to_evict = 0;
11495     uvm_va_block_context_t *block_context;
11496     uvm_page_mask_t *pages_to_evict;
11497     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
11498     uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11499     struct mm_struct *mm;
11500     bool accessed_by_set = false;
11501 
11502     uvm_assert_mutex_locked(&va_block->lock);
11503 
11504     // The block might have been killed in the meantime
11505     if (!va_space)
11506         return NV_OK;
11507 
11508     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
11509     if (!gpu_state)
11510         return NV_OK;
11511 
11512     if (va_block_test && va_block_test->inject_eviction_error) {
11513         va_block_test->inject_eviction_error = false;
11514         return NV_ERR_NO_MEMORY;
11515     }
11516 
    // We cannot take this block's VA space or mmap_lock locks on the eviction
    // path. However, we retain mm in order to support accounting of CPU memory
11519     // allocations. If mappings need to be created,
11520     // block_add_eviction_mappings() will be scheduled below.
11521     mm = uvm_va_space_mm_retain(va_space);
11522     block_context = uvm_va_block_context_alloc(mm);
11523     if (!block_context) {
11524         if (mm)
11525             uvm_va_space_mm_release(va_space);
11526         return NV_ERR_NO_MEMORY;
11527     }
11528 
11529     pages_to_evict = &block_context->caller_page_mask;
11530     uvm_page_mask_zero(pages_to_evict);
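    // chunk_region starts empty and is advanced chunk by chunk in the loop
    // below.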
11531     chunk_region.outer = 0;
11532 
11533     // Find all chunks that are subchunks of the root chunk
11534     for (i = 0; i < num_gpu_chunks; ++i) {
11535         uvm_chunk_size_t chunk_size;
11536         size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size);
11537         UVM_ASSERT(chunk_index == i);
11538         chunk_region.first = chunk_region.outer;
11539         chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE;
11540 
11541         if (!gpu_state->chunks[i])
11542             continue;
11543         if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk))
11544             continue;
11545 
11546         if (uvm_va_block_is_hmm(va_block)) {
11547             status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region);
11548             if (status != NV_OK)
11549                 break;
11550         }
11551 
11552         uvm_page_mask_region_fill(pages_to_evict, chunk_region);
11553         ++chunks_to_evict;
11554     }
11555 
11556     if (chunks_to_evict == 0)
11557         goto out;
11558 
11559     // Only move pages resident on the GPU
11560     uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id));
11561     uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
11562 
11563     if (uvm_va_block_is_hmm(va_block)) {
11564         status = uvm_hmm_va_block_evict_chunks(va_block,
11565                                                block_context,
11566                                                pages_to_evict,
11567                                                uvm_va_block_region_from_block(va_block),
11568                                                &accessed_by_set);
11569     }
11570     else {
11571         block_context->policy = uvm_va_range_get_policy(va_block->va_range);
11572         accessed_by_set = uvm_processor_mask_get_count(&block_context->policy->accessed_by) > 0;
11573 
11574         // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's
11575         // not necessary to do so for eviction. Add a version that unmaps only
11576         // the processors that have mappings to the pages being evicted.
11577         status = uvm_va_block_make_resident(va_block,
11578                                             NULL,
11579                                             block_context,
11580                                             UVM_ID_CPU,
11581                                             uvm_va_block_region_from_block(va_block),
11582                                             pages_to_evict,
11583                                             NULL,
11584                                             UVM_MAKE_RESIDENT_CAUSE_EVICTION);
11585     }
11586     if (status != NV_OK)
11587         goto out;
11588 
    // The VA space lock may not be held here, so we cannot reestablish any
    // mappings; that work is deferred to a work queue item instead.
    //
    // Reading the accessed_by mask without the VA space lock is safe because
    // adding a new processor to the mask triggers going over all the VA blocks
    // in the range and locking them, and we hold this block's lock.
    //
    // If uvm_va_range_set_accessed_by() hasn't called
    // uvm_va_block_set_accessed_by() for this block yet then it will take care
    // of adding the mapping after we are done. If it already did then we are
    // guaranteed to see the new processor in the accessed_by mask because we
    // locked the block's lock that the thread calling
    // uvm_va_range_set_accessed_by() unlocked after updating the mask.
    //
    // If a processor gets removed from the mask then we might not notice and
    // schedule the work item anyway, but that's benign as
    // block_add_eviction_mappings() re-examines the mask.
    //
    // Checking whether access counter migrations are enabled on a VA space is
    // racy without holding the VA space lock, but that is fine because
    // block_add_eviction_mappings() re-examines the value with the VA space
    // lock held.
    if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) {
        // Always retain the VA block first so that it's safe for the deferred
        // callback to release it immediately after it runs.
        uvm_va_block_retain(va_block);

        if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q,
                                          &va_block->eviction_mappings_q_item)) {
            // And release it if no new callback was scheduled
            uvm_va_block_release_no_destroy(va_block);
        }
    }

    status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker);
    if (status != NV_OK)
        goto out;

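    // For each subchunk of the root chunk being evicted: remove any indirect
    // peer reverse mappings, unmap the chunk, and mark it evicted in PMM.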
    for (i = 0; i < num_gpu_chunks; ++i) {
        uvm_gpu_id_t accessing_gpu_id;
        uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];

        if (!chunk)
            continue;
        if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
            continue;

        // Remove the mappings of indirect peers from the reverse map. We
        // access the indirect peer mask from the VA space without holding the
        // VA space lock. Therefore, we can race with enable_peer/disable_peer
        // operations. However this is fine:
        //
        // The enable_peer sequence is as follows:
        //
        // set_bit in va_space->indirect_peers
        // uvm_va_block_enable_peer;
        //
        // - If we read the mask BEFORE it is set or AFTER the mapping has
        // been added to the map there is no race.
        // - If we read the mask AFTER it is set but BEFORE adding the mapping
        // to the reverse map, we will try to remove it although it is not
        // there yet. Therefore, we use
        // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
        // not check if the mapping is present in the reverse map.
        //
        // The disable_peer sequence is as follows:
        //
        // uvm_va_block_disable_peer;
        // clear_bit in va_space->indirect_peers
        //
        // - If we read the mask BEFORE the mapping has been removed or AFTER
        // the bit has been cleared, there is no race.
        // - If we read the mask AFTER the mapping has been removed and BEFORE
        // the bit is cleared, we will try to remove the mapping, too.
        // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works
        // in this scenario.
        //
        // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
        // peers are not supported when SMC is enabled.
        for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
            uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
            NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);

            uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
                                                                   peer_addr);
        }

        uvm_mmu_chunk_unmap(chunk, tracker);

        uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]);
        gpu_state->chunks[i] = NULL;
    }

out:
    uvm_va_block_context_free(block_context);
    if (mm)
        uvm_va_space_mm_release(va_space);

    return status;
}

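// Force all of this GPU's PTEs for the block down to 4k and prevent them from
// being merged back into larger PTEs. Only used on GPUs that lack VA-targeted
// fault cancel support (see uvm_va_block_set_cancel).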
static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
{
    uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
    uvm_push_t push;
    NV_STATUS status;

    // See comment in uvm_va_block_set_cancel
    UVM_ASSERT(!gpu->parent->fault_cancel_va_supported);

    if (!gpu_state)
        return NV_ERR_NO_MEMORY;

    // Force all pages to be 4K and prevent future upgrades during cancel
    gpu_state->force_4k_ptes = true;

    // If we have no page tables we're done. For fault cancel we need to make
    // sure that fatal faults are on different 4k PTEs than non-fatal faults,
    // and we need to service all non-fatal faults before issuing the cancel. So
    // either all faults are fatal and we have no PTEs (we're PROT_NONE), or
    // we'll allocate PTEs later when we service the non-fatal faults. Those
    // PTEs will be 4k since force_4k_ptes is set.
    if (!block_gpu_has_page_tables(block, gpu))
        return NV_OK;

    // Are we 4k already?
    if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
        return NV_OK;

    status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL);
    if (status != NV_OK)
        return status;

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    &block->tracker,
                                    &push,
                                    "Forcing 4k PTEs on block [0x%llx, 0x%llx)",
                                    block->start,
                                    block->end + 1);
    if (status != NV_OK)
        return status;

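    // Split whichever large PTEs are currently in use (a single 2M PTE or a set
    // of big PTEs) down to 4k within this push.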
    if (gpu_state->pte_is_2m)
        block_gpu_split_2m(block, block_context, gpu, NULL, &push);
    else
        block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push);

    uvm_push_end(&push);

    UVM_ASSERT(block_check_mappings(block));

    return uvm_tracker_add_push_safe(&block->tracker, &push);
}

NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
{
    uvm_assert_mutex_locked(&va_block->lock);

    // Volta+ devices support a global VA cancel method that does not require
    // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it
    // could result in 4k PTEs on P9 systems which otherwise would never need
    // them.
    if (gpu->parent->fault_cancel_va_supported)
        return NV_OK;

    return block_gpu_force_4k_ptes(va_block, block_context, gpu);
}

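// Test-only ioctl handler: record the error-injection and allocation-retry
// parameters on the VA block that covers params->lookup_address.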
NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    struct mm_struct *mm;
    uvm_va_block_t *va_block;
    uvm_va_block_test_t *va_block_test;
    uvm_va_block_context_t *block_context = NULL;
    NV_STATUS status = NV_OK;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    block_context = uvm_va_block_context_alloc(mm);
    if (!block_context) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    status = uvm_va_block_find_create(va_space, params->lookup_address, block_context, &va_block);
    if (status != NV_OK)
        goto out;

    va_block_test = uvm_va_block_get_test(va_block);
    UVM_ASSERT(va_block_test);

    uvm_mutex_lock(&va_block->lock);

    if (params->page_table_allocation_retry_force_count)
        va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;

    if (params->user_pages_allocation_retry_force_count)
        va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;

    if (params->cpu_chunk_allocation_size_mask) {
        if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
            !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
            status = NV_ERR_INVALID_ARGUMENT;
            goto block_unlock;
        }

        va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
    }

    if (params->eviction_error)
        va_block_test->inject_eviction_error = params->eviction_error;

    if (params->cpu_pages_allocation_error_count)
        va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;

    if (params->populate_error)
        va_block_test->inject_populate_error = params->populate_error;

block_unlock:
    uvm_mutex_unlock(&va_block->lock);

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    uvm_va_block_context_free(block_context);
    return status;
}

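// Translation tables between the test ioctl's PTE mapping values and UVM
// protection levels.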
static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
{
    [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
    [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
    [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
    [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
};

static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
{
    [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
    [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
    [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
    [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
};

NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_block_t *block;
    struct mm_struct *mm;
    NV_STATUS status = NV_OK;
    uvm_prot_t curr_prot, new_prot;
    uvm_gpu_t *gpu = NULL;
    uvm_processor_id_t id;
    uvm_tracker_t local_tracker;
    uvm_va_block_region_t region;
    uvm_va_block_context_t *block_context = NULL;

    if (!PAGE_ALIGNED(params->va))
        return NV_ERR_INVALID_ADDRESS;

    if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
        return NV_ERR_INVALID_ARGUMENT;

    new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];

    // mmap_lock isn't needed for invalidating CPU mappings, but it will be
    // needed for inserting them.
    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    if (uvm_uuid_is_cpu(&params->uuid)) {
        id = UVM_ID_CPU;
    }
    else {
        gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
        if (!gpu) {
            status = NV_ERR_INVALID_DEVICE;
            goto out;
        }

        // Check if the GPU can access the VA
        if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
            status = NV_ERR_OUT_OF_RANGE;
            goto out;
        }

        id = gpu->id;
    }

    block_context = uvm_va_block_context_alloc(mm);
    if (!block_context) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    status = uvm_va_block_find_create(va_space, params->va, block_context, &block);
    if (status != NV_OK)
        goto out;

    // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
    if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
        goto out;

    uvm_mutex_lock(&block->lock);

    region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
    curr_prot = block_page_prot(block, id, region.first);

    if (new_prot == curr_prot) {
        status = NV_OK;
        goto out_block;
    }

    // TODO: Bug 1766124: Upgrades might require revoking other processors'
    //       access privileges. We just fail for now. Only downgrades are
    //       supported. If we allowed upgrades, we would need to check the mm
    //       like we do for revocation below.
    if (new_prot > curr_prot) {
        status = NV_ERR_INVALID_OPERATION;
        goto out_block;
    }

    block_context->policy = uvm_va_policy_get(block, params->va);

    if (new_prot == UVM_PROT_NONE) {
        status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
    }
    else {
        UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));

        // Revoking CPU mappings performs a combination of unmap + map. The map
        // portion requires a valid mm.
        if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
            status = NV_ERR_INVALID_STATE;
        }
        else {
            status = uvm_va_block_revoke_prot(block,
                                              block_context,
                                              id,
                                              region,
                                              NULL,
                                              new_prot + 1,
                                              &block->tracker);
        }
    }

out_block:
    if (status == NV_OK)
        status = uvm_tracker_init_from(&local_tracker, &block->tracker);

    uvm_mutex_unlock(&block->lock);

    if (status == NV_OK)
        status = uvm_tracker_wait_deinit(&local_tracker);

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);

    uvm_va_block_context_free(block_context);

    return status;
}

NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_block_t *va_block;
    uvm_va_range_t *va_range;
    struct mm_struct *mm;
    size_t index;
    NV_STATUS status = NV_OK;

    BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
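    // If no managed VA range covers the address, look it up as an HMM block.
    // If there is no HMM block either, report the bounds that an HMM block at
    // this address would have.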
    if (!va_range) {
        status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
        if (status == NV_ERR_OBJECT_NOT_FOUND) {
            status = uvm_hmm_va_block_range_bounds(va_space,
                                                   mm,
                                                   params->lookup_address,
                                                   &params->va_block_start,
                                                   &params->va_block_end,
                                                   NULL);
            goto out;
        }
        else if (status != NV_OK) {
            goto out;
        }
    }
    else {
        index = uvm_va_range_block_index(va_range, params->lookup_address);
        va_block = uvm_va_range_block(va_range, index);
        if (!va_block) {
            status = NV_ERR_OBJECT_NOT_FOUND;
            goto out;
        }
    }

    params->va_block_start = va_block->start;
    params->va_block_end   = va_block->end;

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

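// Test-only ioctl handler: report which processors the page containing
// params->lookup_address is resident on, mapped on, and populated on, along
// with the backing physical addresses and page sizes.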
NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    uvm_va_block_t *block = NULL;
    struct mm_struct *mm;
    NvU32 count = 0;
    uvm_processor_mask_t resident_on_mask;
    uvm_processor_id_t id;
    uvm_page_index_t page_index;
    unsigned release_block_count = 0;
    NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
    size_t index;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    // Inline uvm_va_block_find() to get the va_range.
    va_range = uvm_va_range_find(va_space, addr);
    if (!va_range) {
        NvU64 start, end;

        status = uvm_hmm_va_block_find(va_space, addr, &block);
        if (status != NV_OK) {
            if (status != NV_ERR_OBJECT_NOT_FOUND)
                goto out;
            status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
            goto out;
        }
        // Update current CPU mapping information.
        status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
        if (status != NV_OK) {
            block = NULL;
            goto out;
        }
    }
    else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }
    else {
        index = uvm_va_range_block_index(va_range, addr);
        block = uvm_va_range_block(va_range, index);
        if (!block) {
            params->resident_on_count = 0;
            params->populated_on_count = 0;
            params->mapped_on_count = 0;

            status = NV_OK;

            goto out;
        }
    }

    uvm_mutex_lock(&block->lock);

    page_index = uvm_va_block_cpu_page_index(block, addr);
    uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);

    for_each_id_in_mask(id, &resident_on_mask) {
        block_phys_page_t block_page = block_phys_page(id, page_index);
        uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
        params->resident_physical_size[count] = block_phys_page_size(block, block_page);
        if (UVM_ID_IS_CPU(id)) {
            params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
        }
        else {
            params->resident_physical_address[count] =
                block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
        }
        ++count;
    }
    params->resident_on_count = count;

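    // For each processor that maps this page, report the mapping protection,
    // the physical address that is mapped, and the mapping's page size.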
    count = 0;
    for_each_id_in_mask(id, &block->mapped) {
        uvm_processor_id_t processor_to_map;
        block_phys_page_t block_page;
        NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);

        if (page_size == 0)
            continue;

        uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);

        params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
        UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
        processor_to_map = block_get_processor_to_map(block, id, page_index);
        block_page = block_phys_page(processor_to_map, page_index);

        if (!UVM_ID_IS_CPU(id)) {
            uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
                                                                           block_page,
                                                                           uvm_va_space_get_gpu(va_space, id));
            params->mapping_physical_address[count] = gpu_phys_addr.address;
        }
        else {
            struct page *page = block_page_get(block, block_page);

            params->mapping_physical_address[count] = page_to_phys(page);
        }

        params->page_size[count] = page_size;
        ++count;
    }

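    // When the page is resident on exactly one processor, cross-check the
    // physical-to-virtual reverse mappings for the backing memory. Each
    // successful translation is paired with a uvm_va_block_release() at the
    // 'out' label, tracked via release_block_count.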
    if (params->resident_on_count == 1) {
        if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
            if (uvm_pmm_sysmem_mappings_indirect_supported()) {
                for_each_gpu_id(id) {
                    NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
                    uvm_reverse_map_t sysmem_page;
                    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
                    size_t num_pages;
                    uvm_gpu_t *gpu;

                    if (!uvm_va_block_gpu_state_get(block, id))
                        continue;

                    gpu = uvm_va_space_get_gpu(va_space, id);

                    if (!gpu->parent->access_counters_supported)
                        continue;

                    num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
                                                                    uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
                                                                    uvm_cpu_chunk_get_size(chunk),
                                                                    &sysmem_page,
                                                                    1);
                    if (page_size > 0)
                        UVM_ASSERT(num_pages == 1);
                    else
                        UVM_ASSERT(num_pages <= 1);

                    if (num_pages == 1) {
                        UVM_ASSERT(sysmem_page.va_block == block);
                        UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
                        UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);

                        ++release_block_count;
                    }
                }
            }
        }
        else {
            uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
            uvm_reverse_map_t gpu_mapping;
            size_t num_pages;
            uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
            uvm_gpu_phys_address_t phys_addr;

            phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
            num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);

            // Chunk may be in TEMP_PINNED state so it may not have a VA block
            // assigned. In that case, we don't get a valid translation.
            if (num_pages > 0) {
                UVM_ASSERT(num_pages == 1);
                UVM_ASSERT(gpu_mapping.va_block == block);
                UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);

                ++release_block_count;
            }
        }
    }

    params->mapped_on_count = count;

    count = 0;
    for_each_processor_id(id) {
        if (!block_processor_page_is_populated(block, id, page_index))
            continue;

        uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
        ++count;
    }
    params->populated_on_count = count;

out:
    if (block) {
        if (!params->is_async && status == NV_OK)
            status = uvm_tracker_wait(&block->tracker);
        uvm_mutex_unlock(&block->lock);
        while (release_block_count--)
            uvm_va_block_release(block);
    }
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
{
    block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
}