1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_linux.h"
25 #include "uvm_common.h"
26 #include "uvm_api.h"
27 #include "uvm_gpu.h"
28 #include "uvm_va_space.h"
29 #include "uvm_va_range.h"
30 #include "uvm_va_block.h"
31 #include "uvm_hal_types.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_tools.h"
34 #include "uvm_push.h"
35 #include "uvm_hal.h"
36 #include "uvm_perf_thrashing.h"
37 #include "uvm_perf_prefetch.h"
38 #include "uvm_mem.h"
39 #include "uvm_gpu_access_counters.h"
40 #include "uvm_va_space_mm.h"
41 #include "uvm_test_ioctl.h"
42 #include "uvm_conf_computing.h"
43 
44 typedef enum
45 {
46     BLOCK_PTE_OP_MAP,
47     BLOCK_PTE_OP_REVOKE,
48     BLOCK_PTE_OP_COUNT
49 } block_pte_op_t;
50 
51 static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;
52 
53 static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
54 static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
55 static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
56 static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
57 
58 static int uvm_fault_force_sysmem __read_mostly = 0;
59 module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
60 MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");
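
// Illustrative usage note (assumption: the module is loaded as nvidia_uvm):
// because the parameter is registered with S_IRUGO|S_IWUSR it can typically be
// read and toggled at runtime via
// /sys/module/nvidia_uvm/parameters/uvm_fault_force_sysmem, or set at load
// time, e.g. "modprobe nvidia-uvm uvm_fault_force_sysmem=1".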
61 
62 static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
63 module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);
64 
65 // Caching is always disabled for mappings to remote memory. The following two
66 // module parameters can be used to force caching for GPU peer/sysmem mappings.
67 //
68 // However, it may not be safe to enable caching in the general case, so these
69 // parameters should only be used for experiments.
70 static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
71 module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
72 MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
73                  "Force caching for mappings to peer memory. "
74                  "This is an experimental parameter that may cause correctness issues if used.");
75 
76 static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
77 module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
78 MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
79                  "Force caching for mappings to system memory. "
80                  "This is an experimental parameter that may cause correctness issues if used.");
81 
82 static void block_add_eviction_mappings_entry(void *args);
83 
84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block)
85 {
86 #if UVM_IS_CONFIG_HMM()
87     if (va_block->hmm.va_space)
88         return va_block->hmm.va_space;
89 #endif
90 
91     if (va_block->va_range)
92         return va_block->va_range->va_space;
93 
94     return NULL;
95 }
96 
97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
98 {
99     uvm_va_space_t *va_space;
100 
101     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
102 
103     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
104     UVM_ASSERT(va_space);
105 
106     return va_space;
107 }
108 
109 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
110 {
111     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
112 
113     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
114 
115     // Local vidmem is always cached
116     if (uvm_id_equal(resident_id, gpu->id))
117         return UVM_MMU_PTE_FLAGS_CACHED;
118 
119     if (UVM_ID_IS_CPU(resident_id))
120         return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
121 
122     UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));
123 
124     return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
125 }
126 
127 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
128 {
129     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
130 
131     return uvm_va_space_get_gpu(va_space, gpu_id);
132 }
133 
134 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id)
135 {
136     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
137 
138     return uvm_va_space_processor_name(va_space, id);
139 }
140 
141 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id)
142 {
143     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
144 
145     return uvm_va_space_processor_has_memory(va_space, id);
146 }
147 
148 static bool is_uvm_fault_force_sysmem_set(void)
149 {
150     // Only enforce this during testing
151     return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0;
152 }
153 
154 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space)
155 {
156     return uvm_perf_map_remote_on_eviction &&
157            uvm_va_space_has_access_counter_migrations(va_space);
158 }
159 
160 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block)
161 {
162     // Note that for HMM we always return a pointer to a zero bitmap
163     // (not allocated on the stack) since uvm_lite GPUs are not supported.
164     static const uvm_processor_mask_t uvm_lite_gpus = {};
165 
166     if (uvm_va_block_is_hmm(va_block))
167         return &uvm_lite_gpus;
168     else
169         return &va_block->va_range->uvm_lite_gpus;
170 }
171 
172 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
173 {
174     if (!retry)
175         return;
176 
177     uvm_tracker_init(&retry->tracker);
178     INIT_LIST_HEAD(&retry->used_chunks);
179     INIT_LIST_HEAD(&retry->free_chunks);
180 }
181 
182 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks
183 // are stored.
184 //
185 // CPU chunk storage is handled in three different ways depending on the
186 // sizes of the chunks the VA block owns. This is done to minimize the memory
187 // required to hold metadata.
188 typedef enum
189 {
190     // The uvm_va_block_t::chunks pointer points to a single 2MB
191     // CPU chunk.
192     UVM_CPU_CHUNK_STORAGE_CHUNK = 0,
193 
194     // The uvm_va_block_t::chunks pointer points to a
195     // structure of mixed (64K and 4K) chunks.
196     UVM_CPU_CHUNK_STORAGE_MIXED,
197     UVM_CPU_CHUNK_STORAGE_COUNT,
198 } uvm_cpu_chunk_storage_type_t;
199 
200 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1
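
// Illustrative sketch of the encoding (see uvm_cpu_storage_get_ptr() and
// uvm_cpu_storage_get_type() below): the storage pointer and its type tag are
// packed into uvm_va_block_t::chunks as
//     block->cpu.chunks = (unsigned long)ptr | UVM_CPU_CHUNK_STORAGE_MIXED;
// and recovered with
//     ptr  = (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
//     type = block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
// This relies on the storage allocations being at least 2-byte aligned, so the
// bottom bit is always free to hold the tag.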
201 
202 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is
203 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will
204 // be stored in the first/last entry, respectively.
205 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK
206 
207 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE)
208 
209 // This structure is used when a VA block contains 64K or a mix of 64K and 4K
210 // CPU chunks.
211 // For every 64K CPU chunk, big_chunks will have its corresponding bit set
212 // and the corresponding index in slots will point directly to the
213 // uvm_cpu_chunk_t structure.
214 //
215 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and
216 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers.
217 typedef struct {
218     DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
219     void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK];
220 } uvm_cpu_chunk_storage_mixed_t;
221 
222 static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
223                                                         uvm_cpu_chunk_t *chunk,
224                                                         uvm_page_index_t page_index)
225 {
226     UVM_ASSERT(chunk);
227     return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
228 }
229 
230 static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
231 {
232     return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
233 }
234 
235 static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
236 {
237     return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
238 }
239 
240 static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
241 {
242     return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
243 }
244 
245 static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
246 {
247     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
248     uvm_page_index_t prefix;
249     size_t slot_index;
250 
251     UVM_ASSERT(page_index < block_region.outer);
252     prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
253 
254     if (page_index < prefix)
255         return 0;
256 
257     slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
258     UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
259 
260     return slot_index;
261 }
262 
263 static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
264 {
265     size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
266 
267     if (page_index < prefix)
268         return page_index;
269 
270     return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
271 }
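
// Worked example (illustrative, assuming PAGE_SIZE == 4K, so
// MAX_SMALL_CHUNKS_PER_BIG_SLOT == 16): for a va_block whose start is 16K below
// a 64K boundary, compute_page_prefix(block, UVM_PAGE_SIZE_64K) returns 4, i.e.
// the first four 4K pages form the misaligned prefix. Then:
//   - page_index 0..3  -> slot_index 0 (the prefix slot), small_index == page_index
//   - page_index 4..19 -> slot_index 1, small_index == (page_index - 4) % 16
//   - page_index 20    -> slot_index 2, small_index == 0
// matching the "leading misaligned pages go in the first entry" rule above.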
272 
273 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
274                                         uvm_cpu_chunk_t *chunk,
275                                         uvm_page_index_t page_index)
276 {
277     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
278     uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
279     size_t slot_index;
280     uvm_cpu_chunk_storage_mixed_t *mixed;
281     uvm_cpu_chunk_t **chunks = NULL;
282 
283     // We only want to use the bottom bit of a pointer.
284     BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);
285 
286     // We want to protect against two threads manipulating the VA block's CPU
287     // chunks at the same time. However, when a block is split, the new block's
288     // lock is locked without tracking. So, we can't use
289     // uvm_assert_mutex_locked().
290     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
291 
292     if (chunk_size == UVM_CHUNK_SIZE_2M) {
293         UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
294         UVM_ASSERT(!va_block->cpu.chunks);
295         va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
296     }
297     else {
298         if (!va_block->cpu.chunks) {
299             mixed = uvm_kvmalloc_zero(sizeof(*mixed));
300             if (!mixed)
301                 return NV_ERR_NO_MEMORY;
302 
303             va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
304         }
305 
306         UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
307         mixed = uvm_cpu_storage_get_ptr(va_block);
308         slot_index = compute_slot_index(va_block, page_index);
309         UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
310         UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
311 
312         if (chunk_size == UVM_CHUNK_SIZE_64K) {
313             mixed->slots[slot_index] = chunk;
314             set_bit(slot_index, mixed->big_chunks);
315         }
316         else {
317             size_t small_index;
318 
319             UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
320             chunks = mixed->slots[slot_index];
321 
322             if (!chunks) {
323                 chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
324                 if (!chunks)
325                     return NV_ERR_NO_MEMORY;
326                 mixed->slots[slot_index] = chunks;
327             }
328 
329             small_index = compute_small_index(va_block, page_index);
330             chunks[small_index] = chunk;
331         }
332     }
333 
334     uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
335     return NV_OK;
336 }
337 
338 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
339 {
340     uvm_cpu_chunk_storage_mixed_t *mixed;
341     uvm_cpu_chunk_t *chunk;
342     uvm_cpu_chunk_t **chunks;
343     size_t slot_index;
344 
345     UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
346     if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
347         return NULL;
348 
349     UVM_ASSERT(va_block->cpu.chunks);
350 
351     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
352         return uvm_cpu_storage_get_ptr(va_block);
353     }
354     else {
355         mixed = uvm_cpu_storage_get_ptr(va_block);
356         slot_index = compute_slot_index(va_block, page_index);
357         UVM_ASSERT(mixed->slots[slot_index] != NULL);
358         if (test_bit(slot_index, mixed->big_chunks))
359             return mixed->slots[slot_index];
360 
361         chunks = mixed->slots[slot_index];
362         chunk = chunks[compute_small_index(va_block, page_index)];
363     }
364 
365     UVM_ASSERT(chunk);
366     return chunk;
367 }
368 
369 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
370                                      uvm_page_index_t page_index)
371 {
372     uvm_cpu_chunk_storage_mixed_t *mixed;
373     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
374     uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
375     size_t slot_index;
376     uvm_cpu_chunk_t **chunks;
377 
378     // We want to protect against two threads manipulating the VA block's CPU
379     // chunks at the same time. However, when a block is split, the new block's
380     // lock is locked without tracking. So, we can't use
381     // uvm_assert_mutex_locked().
382     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
383     UVM_ASSERT(va_block->cpu.chunks);
384     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));
385 
386     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
387         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
388         UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
389         va_block->cpu.chunks = 0;
390     }
391     else {
392         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
393         mixed = uvm_cpu_storage_get_ptr(va_block);
394         slot_index = compute_slot_index(va_block, page_index);
395         UVM_ASSERT(mixed->slots[slot_index] != NULL);
396 
397         if (test_bit(slot_index, mixed->big_chunks)) {
398             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
399             UVM_ASSERT(mixed->slots[slot_index] == chunk);
400             mixed->slots[slot_index] = NULL;
401             clear_bit(slot_index, mixed->big_chunks);
402         }
403         else {
404             size_t small_index;
405 
406             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
407             chunks = mixed->slots[slot_index];
408             small_index = compute_small_index(va_block, page_index);
409             UVM_ASSERT(chunks[small_index] == chunk);
410             chunks[small_index] = NULL;
411 
412             for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
413                 if (chunks[small_index])
414                     break;
415             }
416 
417             if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
418                 uvm_kvfree(chunks);
419                 mixed->slots[slot_index] = NULL;
420             }
421         }
422     }
423 
424     uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);
425 
426     if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
427         uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
428         va_block->cpu.chunks = 0;
429     }
430 }
431 
432 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
433 {
434     uvm_va_block_region_t chunk_region;
435     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
436 
437     UVM_ASSERT(chunk);
438     UVM_ASSERT(chunk->page);
439     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
440     return chunk->page + (page_index - chunk_region.first);
441 }
442 
443 static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
444                                                       uvm_va_block_region_t region,
445                                                       uvm_page_index_t *first_chunk_page)
446 {
447     uvm_cpu_chunk_t *chunk = NULL;
448     uvm_page_index_t page_index;
449 
450     page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
451     if (page_index < region.outer)
452         chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
453 
454     if (first_chunk_page && chunk) {
455         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
456         *first_chunk_page = chunk_region.first;
457     }
458 
459     return chunk;
460 }
461 
462 #define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region)                                       \
463     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index));                                \
464          (chunk) != NULL;                                                                                             \
465          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
466                                                  uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
467                                                                      (region).outer),                                 \
468                                                  &(page_index)))
469 
470 #define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region)    \
471     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)),                   \
472                        (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);  \
473          (chunk) != NULL;                                                                                \
474          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                             \
475                                                  uvm_va_block_region((next_page_index), (region).outer), \
476                                                  &(page_index)),                                         \
477              (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
478 
479 #define for_each_cpu_chunk_in_block(chunk, page_index, va_block)        \
480     for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))
481 
482 #define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)  \
483     for_each_cpu_chunk_in_block_region_safe((chunk),                                    \
484                                             (page_index),                               \
485                                             (next_page_index),                          \
486                                             (va_block),                                 \
487                                             uvm_va_block_region_from_block((va_block)))
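
// Usage sketch for the iterators above (mirrors the loops used later in this
// file, e.g. block_check_cpu_chunks()):
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block)
//         process_chunk(chunk, page_index);   // process_chunk() is hypothetical
//
// The _safe variants additionally track next_page_index so the current chunk
// may be removed from the block during iteration (see
// uvm_va_block_remove_cpu_chunks()).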
488 
489 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
490                                                     struct mm_struct *mm,
491                                                     NvU64 start,
492                                                     uvm_va_block_region_t *region)
493 {
494     struct vm_area_struct *vma;
495     NvU64 end;
496 
497     if (start > va_block->end)
498         return NULL;
499 
500     vma = find_vma_intersection(mm, start, va_block->end + 1);
501     if (!vma)
502         return NULL;
503 
504     if (start < vma->vm_start)
505         start = vma->vm_start;
506 
507     end = vma->vm_end - 1;
508     if (end > va_block->end)
509         end = va_block->end;
510 
511     *region = uvm_va_block_region_from_start_end(va_block, start, end);
512 
513     return vma;
514 }
515 
516 static bool block_check_cpu_chunks(uvm_va_block_t *block)
517 {
518     uvm_cpu_chunk_t *chunk;
519     size_t alloced_pages = 0;
520     uvm_va_block_region_t prev_region = { 0 };
521     uvm_page_index_t page_index;
522 
523     for_each_cpu_chunk_in_block(chunk, page_index, block) {
524         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
525         size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
526         uvm_page_index_t chunk_page;
527 
528         UVM_ASSERT(prev_region.outer <= chunk_region.first);
529         UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
530         UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
531 
532         alloced_pages += uvm_cpu_chunk_num_pages(chunk);
533         UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
534         prev_region = chunk_region;
535 
536         for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
537             UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
538     }
539 
540     UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));
541 
542     return true;
543 }
544 
545 // Frees any left-over free chunks and unpins all the used chunks
546 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
547 {
548     uvm_gpu_t *gpu;
549     uvm_gpu_chunk_t *gpu_chunk;
550     uvm_gpu_chunk_t *next_chunk;
551 
552     if (!retry)
553         return;
554 
555     uvm_tracker_deinit(&retry->tracker);
556 
557     // Free any unused chunks
558     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
559         list_del_init(&gpu_chunk->list);
560         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
561         uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
562     }
563 
564     // Unpin all the used chunks now that we are done
565     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
566         list_del_init(&gpu_chunk->list);
567         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
568         // HMM should have already moved allocated chunks to the referenced
569         // state, so any left over were not migrated and should be freed.
570         if (uvm_va_block_is_hmm(va_block))
571             uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
572         else
573             uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
574     }
575 }
576 
577 static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
578 {
579     list_add_tail(&gpu_chunk->list, &retry->free_chunks);
580 }
581 
582 static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
583 {
584     list_add_tail(&gpu_chunk->list, &retry->used_chunks);
585 }
586 
587 static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
588 {
589     uvm_gpu_chunk_t *gpu_chunk;
590 
591     list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
592         if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
593             list_del_init(&gpu_chunk->list);
594             return gpu_chunk;
595         }
596     }
597 
598     return NULL;
599 }
600 
601 // Encapsulates a reference to a physical page belonging to a specific processor
602 // within a VA block.
603 typedef struct
604 {
605     // Processor the page is on
606     uvm_processor_id_t processor;
607 
608     // The page index
609     uvm_page_index_t page_index;
610 } block_phys_page_t;
611 
612 static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
613 {
614     return (block_phys_page_t){ processor, page_index };
615 }
616 
617 NV_STATUS uvm_va_block_init(void)
618 {
619     if (uvm_enable_builtin_tests)
620         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
621     else
622         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);
623 
624     if (!g_uvm_va_block_cache)
625         return NV_ERR_NO_MEMORY;
626 
627     g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
628     if (!g_uvm_va_block_gpu_state_cache)
629         return NV_ERR_NO_MEMORY;
630 
631     g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
632     if (!g_uvm_page_mask_cache)
633         return NV_ERR_NO_MEMORY;
634 
635     g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
636     if (!g_uvm_va_block_context_cache)
637         return NV_ERR_NO_MEMORY;
638 
639     return NV_OK;
640 }
641 
642 void uvm_va_block_exit(void)
643 {
644     kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
645     kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
646     kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
647     kmem_cache_destroy_safe(&g_uvm_va_block_cache);
648 }
649 
650 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
651 {
652     uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
653     if (block_context)
654         uvm_va_block_context_init(block_context, mm);
655 
656     return block_context;
657 }
658 
659 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
660 {
661     if (va_block_context)
662         kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
663 }
664 
665 // Convert from page_index to chunk_index. The goal is for each system page in
666 // the region [start, start + size) to be covered by the largest naturally-
667 // aligned user chunk size.
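//
// Worked example (illustrative, assuming PAGE_SIZE == 4K and GPU chunk sizes
// {4K, 64K, 2M}): for a region whose start is 8K past a 2M boundary with
// size == 120K, pages 0..13 are each backed by a 4K chunk (up to the first 64K
// boundary) and the rest by a single 64K chunk, so page_index 20 returns chunk
// index 14 with *out_chunk_size == UVM_CHUNK_SIZE_64K.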
668 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
669                                           NvU64 size,
670                                           uvm_gpu_t *gpu,
671                                           uvm_page_index_t page_index,
672                                           uvm_chunk_size_t *out_chunk_size)
673 {
674     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
675     uvm_chunk_size_t chunk_size, final_chunk_size;
676     size_t num_chunks, num_chunks_total;
677     NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;
678 
679     UVM_ASSERT(PAGE_ALIGNED(start));
680     UVM_ASSERT(PAGE_ALIGNED(size));
681     UVM_ASSERT(size > 0);
682     UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
683     UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
684     BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);
685 
686     // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
687     // mask, since we never deal with chunk sizes smaller than that (although we
688     // may have PTEs mapping pages smaller than that).
689     UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);
690 
691     // Optimize the ideal Pascal+ case: the whole block is covered by a single
692     // 2M page.
693     if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
694         UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
695         final_chunk_size = UVM_CHUNK_SIZE_2M;
696         num_chunks_total = 0;
697         goto out;
698     }
699 
700     // Only one 2M chunk can fit within a VA block on any GPU architecture, so
701     // remove that size from consideration.
702     chunk_sizes &= ~UVM_CHUNK_SIZE_2M;
703 
704     // Next common case: the whole block is aligned and sized to perfectly fit
705     // the largest page size.
706     final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
707     if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
708         num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
709         goto out;
710     }
711 
712     // We didn't hit our special paths. Do it the hard way.
713 
714     num_chunks_total = 0;
715     addr = start + page_index * PAGE_SIZE;
716     end = start + size;
717     final_chunk_size = 0;
718     UVM_ASSERT(addr < end);
719 
720     // The below loop collapses almost completely when chunk_size == PAGE_SIZE
721     // since in that lowest-common-denominator case everything is already
722     // aligned. Skip it and handle that specially after the loop.
723     //
724     // Note that since we removed 2M already above, this loop will only iterate
725     // once on x86 Pascal+ since only 64K is left.
726     chunk_sizes &= ~PAGE_SIZE;
727 
728     // This loop calculates the number of chunks between start and addr by
729     // calculating the number of whole chunks of each size between them,
730     // starting with the largest allowed chunk size. This requires fewer
731     // iterations than if we began from start and kept calculating the next
732     // larger chunk size boundary.
733     for_each_chunk_size_rev(chunk_size, chunk_sizes) {
734         aligned_start = UVM_ALIGN_UP(start, chunk_size);
735         aligned_addr  = UVM_ALIGN_DOWN(addr, chunk_size);
736         aligned_end   = UVM_ALIGN_DOWN(end, chunk_size);
737 
738         // If addr and start are within the same chunk, try smaller
739         if (aligned_start > aligned_addr)
740             continue;
741 
742         // If addr and end are not in the same chunk, then addr is covered by a
743         // single chunk of the current size. Ignore smaller boundaries between
744         // addr and aligned_addr.
745         if (aligned_addr < aligned_end && final_chunk_size == 0) {
746             addr = aligned_addr;
747             final_chunk_size = chunk_size;
748         }
749 
750         // How many chunks of this size are between start and addr? Note that
751         // this might be 0 since aligned_addr and aligned_start could be in the
752         // same chunk.
753         num_chunks = uvm_div_pow2_32((NvU32)(aligned_addr - aligned_start), chunk_size);
754         num_chunks_total += num_chunks;
755 
756         // We've already accounted for these chunks, so "remove" them by
757         // bringing start, addr, and end closer together to calculate the
758         // remaining chunk sizes.
759         temp_size = num_chunks * chunk_size;
760         addr -= temp_size;
761         end -= temp_size;
762 
763         // Once there's no separation between addr and start, and we've
764         // successfully found the right chunk size when taking end into account,
765         // we're done.
766         if (addr == start && final_chunk_size)
767             break;
768     }
769 
770     // Handle PAGE_SIZE cleanup since we skipped it in the loop
771     num_chunks_total += (addr - start) / PAGE_SIZE;
772     if (final_chunk_size == 0)
773         final_chunk_size = PAGE_SIZE;
774 
775 out:
776     if (out_chunk_size)
777         *out_chunk_size = final_chunk_size;
778 
779     return num_chunks_total;
780 }
781 
782 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
783                                           NvU64 start,
784                                           NvU64 size,
785                                           uvm_gpu_t *gpu,
786                                           uvm_page_index_t page_index,
787                                           uvm_chunk_size_t *out_chunk_size)
788 {
789     if (uvm_va_block_is_hmm(va_block)) {
790         if (out_chunk_size)
791             *out_chunk_size = PAGE_SIZE;
792         return page_index;
793     }
794 
795     return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
796 }
797 
798 static size_t block_gpu_chunk_index(uvm_va_block_t *block,
799                                     uvm_gpu_t *gpu,
800                                     uvm_page_index_t page_index,
801                                     uvm_chunk_size_t *out_chunk_size)
802 {
803     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
804     uvm_chunk_size_t size;
805     uvm_gpu_chunk_t *chunk;
806     size_t index;
807 
808     index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);
809 
810     UVM_ASSERT(size >= PAGE_SIZE);
811 
812     if (gpu_state) {
813         UVM_ASSERT(gpu_state->chunks);
814         chunk = gpu_state->chunks[index];
815         if (chunk) {
816             UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
817             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
818             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
819         }
820     }
821 
822     if (out_chunk_size)
823         *out_chunk_size = size;
824 
825     return index;
826 }
827 
828 // Compute the size of the chunk known to start at start_page_index
829 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
830 {
831     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
832     uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
833     NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
834     NvU64 size = block->end - start + 1;
835 
836     if (uvm_va_block_is_hmm(block))
837         return PAGE_SIZE;
838 
839     // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
840     // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
841     // Example: 1011000 -> 0001111
842     start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));
843 
844     // Next, compute all sizes (powers of two) which are <= size.
845     pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
846     pow2_leq_size |= pow2_leq_size - 1;
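    // For example (illustrative): if size is 96K, rounddown_pow_of_two(size) is
    // 64K, so pow2_leq_size becomes a mask of every power of two up to 64K.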
847 
848     // Now AND them all together to get our list of GPU-supported chunk sizes
849     // which are aligned to start and will fit within size.
850     allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;
851 
852     // start and size must always be aligned to at least the smallest supported
853     // chunk size (PAGE_SIZE).
854     UVM_ASSERT(allowed_sizes >= PAGE_SIZE);
855 
856     // Take the largest allowed size
857     return uvm_chunk_find_last_size(allowed_sizes);
858 }
859 
860 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
861 {
862     return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
863 }
864 
865 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
866 {
867     uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
868     return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
869 }
870 
871 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
872 {
873     size_t chunk_index;
874     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
875     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
876 
877     uvm_assert_mutex_locked(&va_block->lock);
878 
879     if (!gpu_state)
880         return NULL;
881 
882     chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
883 
884     return gpu_state->chunks[chunk_index];
885 }
886 
887 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
888                               NvU64 start,
889                               NvU64 end,
890                               uvm_va_block_t **out_block)
891 {
892     uvm_va_block_t *block = NULL;
893     NvU64 size = end - start + 1;
894 
895     UVM_ASSERT(PAGE_ALIGNED(start));
896     UVM_ASSERT(PAGE_ALIGNED(end + 1));
897     UVM_ASSERT(PAGE_ALIGNED(size));
898     UVM_ASSERT(size > 0);
899     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
900 
901     if (va_range) {
902         // Create a managed va_block.
903         UVM_ASSERT(start >= va_range->node.start);
904         UVM_ASSERT(end <= va_range->node.end);
905         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
906     }
907 
908     // Blocks can't span a block alignment boundary
909     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
910 
911     if (uvm_enable_builtin_tests) {
912         uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
913 
914         if (block_wrapper)
915             block = &block_wrapper->block;
916     }
917     else {
918         block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
919     }
920 
921     if (!block)
922         return NV_ERR_NO_MEMORY;
923 
924     nv_kref_init(&block->kref);
925     uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
926     block->start = start;
927     block->end = end;
928     block->va_range = va_range;
929     uvm_tracker_init(&block->tracker);
930     block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;
931 
932     nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);
933 
934     *out_block = block;
935     return NV_OK;
936 }
937 
938 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
939 {
940     NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
941     if (gpu_mapping_addr == 0)
942         return;
943 
944     uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
945     uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
946 }
947 
948 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
949                                                   uvm_va_block_t *block,
950                                                   uvm_page_index_t page_index,
951                                                   uvm_gpu_t *gpu)
952 {
953     NV_STATUS status;
954     uvm_chunk_size_t chunk_size;
955 
956     // When the Confidential Computing feature is enabled, transfers don't use
957     // the DMA mapping of CPU chunks (since it's protected memory), but rather
958     // the DMA address of the unprotected DMA buffer.
959     if (uvm_conf_computing_mode_enabled(gpu))
960         return NV_OK;
961 
962     status = uvm_cpu_chunk_map_gpu(chunk, gpu);
963     if (status != NV_OK)
964         return status;
965 
966     chunk_size = uvm_cpu_chunk_get_size(chunk);
967 
968     // TODO: Bug 3744779: Handle benign assertion in
969     //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
970     //       failure.
971     status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
972                                                      uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
973                                                      uvm_va_block_cpu_page_address(block, page_index),
974                                                      chunk_size,
975                                                      block,
976                                                      UVM_ID_CPU);
977     if (status != NV_OK)
978         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
979 
980     return status;
981 }
982 
983 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
984 {
985     uvm_cpu_chunk_t *chunk;
986     uvm_page_index_t page_index;
987 
988     for_each_cpu_chunk_in_block(chunk, page_index, block)
989         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
990 }
991 
992 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
993 {
994     NV_STATUS status;
995     uvm_cpu_chunk_t *chunk;
996     NvU64 block_mapping_size = uvm_va_block_size(block);
997     uvm_page_index_t page_index;
998 
999     UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
1000 
1001     for_each_cpu_chunk_in_block(chunk, page_index, block) {
1002         UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
1003                        "GPU%u DMA address 0x%llx\n",
1004                        uvm_id_value(gpu->id),
1005                        uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));
1006 
1007         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
1008         if (status != NV_OK)
1009             goto error;
1010     }
1011 
1012     return NV_OK;
1013 
1014 error:
1015     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
1016     return status;
1017 }
1018 
1019 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
1020                                                      uvm_gpu_t *local_gpu,
1021                                                      uvm_gpu_chunk_t *chunk,
1022                                                      uvm_gpu_t *accessing_gpu)
1023 {
1024     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1025     return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
1026                                                          peer_addr,
1027                                                          block->start + chunk->va_block_page_index * PAGE_SIZE,
1028                                                          uvm_gpu_chunk_get_size(chunk),
1029                                                          block,
1030                                                          local_gpu->id);
1031 }
1032 
1033 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
1034                                                    uvm_gpu_chunk_t *chunk,
1035                                                    uvm_gpu_t *accessing_gpu)
1036 {
1037     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1038     uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
1039 }
1040 
1041 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
1042                                                         uvm_gpu_t *local_gpu,
1043                                                         uvm_gpu_t *accessing_gpu)
1044 {
1045     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1046     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1047     size_t num_chunks, i;
1048     NV_STATUS status;
1049 
1050     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1051                                        accessing_gpu->id));
1052 
1053     // If no chunks are allocated currently, the mappings will be created later
1054     // at chunk allocation.
1055     if (!gpu_state || !gpu_state->chunks)
1056         return NV_OK;
1057 
1058     num_chunks = block_num_gpu_chunks(block, local_gpu);
1059     for (i = 0; i < num_chunks; i++) {
1060         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1061         if (!chunk)
1062             continue;
1063 
1064         status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
1065         if (status != NV_OK)
1066             goto error;
1067 
1068         status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
1069         if (status != NV_OK)
1070             goto error;
1071     }
1072 
1073     return NV_OK;
1074 
1075 error:
1076     while (i-- > 0) {
1077         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1078         if (chunk) {
1079             // Indirect peer mappings are removed lazily by PMM, so if an error
1080             // occurs the mappings established above will be removed when the
1081             // chunk is freed later on. We only need to remove the sysmem
1082             // reverse mappings.
1083             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1084         }
1085     }
1086 
1087     return status;
1088 }
1089 
1090 // Mappings for indirect peers are removed lazily by PMM, but we need to remove
1091 // the entries from the reverse map.
1092 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
1093                                                      uvm_gpu_t *local_gpu,
1094                                                      uvm_gpu_t *accessing_gpu)
1095 {
1096     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1097     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1098     size_t num_chunks, i;
1099 
1100     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1101                                        accessing_gpu->id));
1102 
1103     // Exit if no chunks are allocated currently.
1104     if (!gpu_state || !gpu_state->chunks)
1105         return;
1106 
1107     num_chunks = block_num_gpu_chunks(block, local_gpu);
1108     for (i = 0; i < num_chunks; i++) {
1109         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1110         if (chunk)
1111             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1112     }
1113 }
1114 
1115 // Retrieves the gpu_state for the given GPU. The returned pointer is
1116 // internally managed and will be allocated (and freed) automatically,
1117 // rather than by the caller.
1118 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
1119 {
1120     NV_STATUS status;
1121     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1122 
1123     if (gpu_state)
1124         return gpu_state;
1125 
1126     gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
1127     if (!gpu_state)
1128         return NULL;
1129 
1130     gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
1131     if (!gpu_state->chunks)
1132         goto error;
1133 
1134     block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;
1135 
1136     status = block_gpu_map_phys_all_cpu_pages(block, gpu);
1137     if (status != NV_OK)
1138         goto error;
1139 
1140     return gpu_state;
1141 
1142 error:
1143     uvm_kvfree(gpu_state->chunks);
1144     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
1145     block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;
1146 
1147     return NULL;
1148 }
1149 
1150 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
1151 {
1152     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1153     uvm_gpu_id_t gpu_id;
1154 
1155     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1156     uvm_assert_mutex_locked(&va_block->lock);
1157 
1158     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
1159         if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
1160             return NV_ERR_NO_MEMORY;
1161     }
1162 
1163     return NV_OK;
1164 }
1165 
1166 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
1167                                           uvm_cpu_chunk_t *chunk,
1168                                           uvm_page_index_t page_index)
1169 {
1170     uvm_gpu_id_t id;
1171 
1172     for_each_gpu_id(id) {
1173         if (uvm_va_block_gpu_state_get(block, id))
1174             cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
1175     }
1176 }
1177 
1178 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
1179                                              uvm_page_index_t page_index)
1180 {
1181     NV_STATUS status;
1182     uvm_gpu_id_t id;
1183     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1184     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1185     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1186 
1187     // We can't iterate over va_space->registered_gpus because we might be
1188     // on the eviction path, which does not have the VA space lock held. We have
1189     // the VA block lock held however, so the gpu_states can't change.
1190     uvm_assert_mutex_locked(&block->lock);
1191 
1192     for_each_gpu_id(id) {
1193         uvm_gpu_t *gpu;
1194 
1195         if (!uvm_va_block_gpu_state_get(block, id))
1196             continue;
1197 
1198         gpu = block_get_gpu(block, id);
1199         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
1200         if (status != NV_OK)
1201             goto error;
1202     }
1203 
1204     return NV_OK;
1205 
1206 error:
1207     uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
1208     return status;
1209 }
1210 
1211 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1212 {
1213     uvm_cpu_chunk_t *chunk;
1214     uvm_page_index_t page_index, next_page_index;
1215     uvm_va_block_region_t chunk_region;
1216 
1217     for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
1218         chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
1219 
1220         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
1221         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
1222         uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
1223         uvm_cpu_chunk_remove_from_block(va_block, page_index);
1224         uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1225         uvm_cpu_chunk_free(chunk);
1226     }
1227 
1228     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1229         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1230     if (uvm_page_mask_empty(&va_block->cpu.resident))
1231         uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
1232 }
1233 
1234 // Create physical mappings to allow other GPUs to access this chunk.
1235 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1236 {
1237     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1238     uvm_gpu_t *accessing_gpu, *remove_gpu;
1239     NV_STATUS status;
1240 
1241     // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
1242     // the eviction path, so we can assume that the VA space is locked.
1243     //
1244     // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
1245     //       meaning we may need to allocate peer memory and map it on the
1246     //       eviction path. That will require making sure that peers can't be
1247     //       enabled or disabled either in the VA space or globally within this
1248     //       function.
1249     uvm_assert_rwsem_locked(&va_space->lock);
1250     uvm_assert_mutex_locked(&block->lock);
1251 
1252     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1253         status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
1254         if (status != NV_OK)
1255             goto error;
1256 
1257         status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
1258         if (status != NV_OK)
1259             goto error;
1260     }
1261 
1262     return NV_OK;
1263 
1264 error:
1265     for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1266         if (remove_gpu == accessing_gpu)
1267             break;
1268 
1269         // Indirect peer mappings are removed lazily by PMM, so if an error
1270         // occurs the mappings established above will be removed when the
1271         // chunk is freed later on. We only need to remove the sysmem
1272         // reverse mappings.
1273         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
1274     }
1275 
1276     return status;
1277 }
1278 
1279 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1280 {
1281     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1282     uvm_gpu_t *peer_gpu;
1283 
1284     uvm_assert_rwsem_locked(&va_space->lock);
1285     uvm_assert_mutex_locked(&block->lock);
1286 
1287     // Indirect peer mappings are removed lazily by PMM, so we only need to
1288     // remove the sysmem reverse mappings.
1289     for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
1290         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
1291 }
1292 
1293 // Mark a CPU page as dirty.
1294 static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1295 {
1296     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1297     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1298     uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
1299 }
1300 
1301 // Mark a CPU page as clean.
1302 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
1303 {
1304     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1305     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1306     uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
1307 }
1308 
1309 // Check if a CPU page is dirty.
1310 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1311 {
1312     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1313     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1314     return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
1315 }
1316 
1317 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
1318                                        uvm_chunk_size_t alloc_size,
1319                                        uvm_cpu_chunk_alloc_flags_t flags,
1320                                        uvm_cpu_chunk_t **chunk)
1321 {
1322     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1323 
1324     // Return an out-of-memory error if the tests have requested it. As opposed
1325     // to other error injection settings, this one fails N times and then succeeds.
1326     // TODO: Bug 3701182: This will print a warning in Linux kernels 5.16.0-rc1
1327     // and newer.
1328     if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
1329         if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
1330             block_test->inject_cpu_pages_allocation_error_count--;
1331         return NV_ERR_NO_MEMORY;
1332     }
1333 
1334     return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
1335 }
1336 
// Allocates CPU pages for the given region and page mask in the block, if they
// don't already exist.
//
// Also maps the pages for physical access by all GPUs used by the block, which
// is required for IOMMU support. The mapping is skipped on GPUs without access
// to CPU memory, e.g., when the Confidential Computing feature is enabled.
1342 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
1343                                           uvm_page_mask_t *populate_page_mask,
1344                                           uvm_va_block_region_t populate_region,
1345                                           uvm_va_block_context_t *block_context)
1346 {
1347     NV_STATUS status = NV_OK;
1348     uvm_cpu_chunk_t *chunk;
1349     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1350     uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
1351     uvm_chunk_size_t alloc_size;
1352     uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
1353     uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
1354     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1355     uvm_processor_mask_t uvm_lite_gpus;
1356     uvm_page_index_t page_index;
1357     uvm_gpu_id_t id;
1358 
1359     // Check whether all requested pages have already been allocated.
1360     uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
1361     if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
1362                               &block_context->scratch_page_mask,
1363                               &block->cpu.allocated))
1364         return NV_OK;
1365 
1366     if (block_test) {
1367         if (block_test->cpu_chunk_allocation_size_mask)
1368             cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
1369     }
1370 
1371     uvm_page_mask_zero(resident_mask);
1372     for_each_id_in_mask (id, &block->resident)
1373         uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));
1374 
1375     // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
1376     // should be used in order to avoid extra copies due to dirty compound
1377     // pages. HMM va_blocks also require PAGE_SIZE allocations.
1378     // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
1379     // migrations.
1380     uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors);
1381     if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block))
1382         cpu_allocation_sizes = PAGE_SIZE;
1383 
1384     if (block_context->mm)
1385         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;
1386 
1387     UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
1388     UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);
1389 
1390     for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
1391         uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
1392         uvm_va_block_region_t region = populate_region;
1393 
1394         if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
1395             page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
1396             continue;
1397         }
1398 
1399         UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));
1400 
1401         chunk_alloc_flags = alloc_flags;
1402 
        // Attempt to allocate CPU pages with the largest physically contiguous
        // size possible from the set of allowed CPU chunk sizes.
        // This is accomplished by:
        //   1. Aligning the CPU page address down to the allocation size.
        //   2. Ensuring that the entire allocation region fits within the VA
        //      block.
        //   3. Ensuring that the region covered by the allocation is empty.
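        //
        // Illustrative example (assuming a 64K allocation size is allowed and
        // the block start is 64K-aligned): a page at block offset 0x5000 is
        // aligned down to block offset 0x0, giving a 16-page candidate region.
        // If that region lies entirely within the block and none of its pages
        // are already allocated, a single 64K chunk backs all 16 pages and the
        // outer loop then skips ahead to the end of the region.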
1410         for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1411             NvU64 alloc_virt_addr;
1412 
1413             chunk = NULL;
1414             alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size);
1415 
1416             if (!uvm_va_block_contains_address(block, alloc_virt_addr) ||
1417                 !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1))
1418                 continue;
1419 
1420             region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);
1421 
1422             if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
1423                 continue;
1424 
            // If not all pages in the allocation region are resident somewhere,
            // zero out the allocated pages.
            // This could be wasteful if only a few pages in a high-order
            // allocation need to be zeroed out, but the alternative is to map
            // single sub-pages one-by-one.
1430             if (!uvm_page_mask_region_full(resident_mask, region))
1431                 chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
1432 
1433             status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
1434             if (status == NV_OK) {
1435                 page_index = region.first;
1436                 break;
1437             }
1438 
1439             UVM_ASSERT(status == NV_ERR_NO_MEMORY);
1440         }
1441 
1442         if (status != NV_OK)
1443             break;
1444 
1445         status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
1446         if (status != NV_OK) {
1447             uvm_cpu_chunk_free(chunk);
1448             return status;
1449         }
1450 
1451         status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
1452         if (status != NV_OK)
1453             break;
1454 
1455         // Skip iterating over all pages covered by the allocated chunk.
1456         page_index = region.outer - 1;
1457     }
1458 
1459     if (status != NV_OK && chunk) {
1460         uvm_cpu_chunk_remove_from_block(block, page_index);
1461         uvm_cpu_chunk_free(chunk);
1462     }
1463 
1464     return status;
1465 }
1466 
1467 // Try allocating a chunk. If eviction was required,
1468 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
1469 // unlocked and relocked. The caller is responsible for adding the chunk to the
1470 // retry used_chunks list.
1471 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
1472                                        uvm_va_block_retry_t *retry,
1473                                        uvm_gpu_t *gpu,
1474                                        uvm_chunk_size_t size,
1475                                        uvm_gpu_chunk_t **out_gpu_chunk)
1476 {
1477     NV_STATUS status = NV_OK;
1478     uvm_gpu_chunk_t *gpu_chunk;
1479 
1480     // First try getting a free chunk from previously-made allocations.
1481     gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
1482     if (!gpu_chunk) {
1483         uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1484         if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
1485             // Force eviction by pretending the allocation failed with no memory
1486             --block_test->user_pages_allocation_retry_force_count;
1487             status = NV_ERR_NO_MEMORY;
1488         }
1489         else {
1490             // Try allocating a new one without eviction
1491             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
1492         }
1493 
1494         if (status == NV_ERR_NO_MEMORY) {
            // If that fails with no memory, try allocating with eviction and
            // return to the caller immediately so that the operation can be
            // restarted.
1498             uvm_mutex_unlock(&block->lock);
1499 
1500             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
1501             if (status == NV_OK) {
1502                 block_retry_add_free_chunk(retry, gpu_chunk);
1503                 status = NV_ERR_MORE_PROCESSING_REQUIRED;
1504             }
1505 
1506             uvm_mutex_lock(&block->lock);
1507             return status;
1508         }
1509         else if (status != NV_OK) {
1510             return status;
1511         }
1512     }
1513 
1514     *out_gpu_chunk = gpu_chunk;
1515     return NV_OK;
1516 }
1517 
1518 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
1519 {
1520     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1521 
1522     if (!gpu_state)
1523         return false;
1524 
1525     return gpu_state->page_table_range_4k.table  ||
1526            gpu_state->page_table_range_big.table ||
1527            gpu_state->page_table_range_2m.table;
1528 }
1529 
1530 // A helper to get a known-to-be-present GPU VA space given a VA block that's
1531 // locked. In order to use this function, the caller must know that at least one
1532 // of these conditions is true:
1533 //
1534 // 1) The VA space lock is held
1535 // 2) The VA block has active page tables for the GPU
1536 //
1537 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go
1538 // away.
1539 //
1540 // On the eviction path, we don't have a lock on the VA space state. However,
1541 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page
1542 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU
1543 // has page tables (#2), the gpu_va_space can't go away while we're holding the
1544 // block lock.
1545 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1546 {
1547     uvm_gpu_va_space_t *gpu_va_space;
1548     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1549 
1550     UVM_ASSERT(gpu);
1551 
1552     if (!block_gpu_has_page_tables(va_block, gpu))
1553         uvm_assert_rwsem_locked(&va_space->lock);
1554 
1555     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));
1556 
1557     gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
1558 
1559     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1560     UVM_ASSERT(gpu_va_space->va_space == va_space);
1561     UVM_ASSERT(gpu_va_space->gpu == gpu);
1562 
1563     return gpu_va_space;
1564 }
1565 
1566 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
1567 {
1568     uvm_gpu_va_space_t *gpu_va_space;
1569 
1570     // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
1571     if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
1572         return false;
1573 
1574     UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);
1575 
1576     gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
1577     return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
1578 }
1579 
1580 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1581 {
1582     uvm_gpu_va_space_t *gpu_va_space;
1583 
1584     gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
1585     return gpu_va_space->page_tables.big_page_size;
1586 }
1587 
1588 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
1589 {
1590     NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
1591     NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
1592 
1593     // The range must fit within a VA block
1594     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
1595 
1596     if (outer_addr <= first_addr)
1597         return uvm_va_block_region(0, 0);
1598 
1599     return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
1600 }
1601 
1602 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
1603 {
1604     uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
1605     return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
1606 }
1607 
1608 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
1609 {
1610     return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
1611 }
1612 
1613 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
1614                                                           uvm_va_block_region_t region,
1615                                                           NvU32 big_page_size)
1616 {
1617     NvU64 start = uvm_va_block_region_start(va_block, region);
1618     NvU64 end = uvm_va_block_region_end(va_block, region);
1619     uvm_va_block_region_t big_region;
1620 
1621     UVM_ASSERT(start < va_block->end);
1622     UVM_ASSERT(end <= va_block->end);
1623 
1624     big_region = range_big_page_region_all(start, end, big_page_size);
1625     if (big_region.outer) {
1626         big_region.first += region.first;
1627         big_region.outer += region.first;
1628     }
1629 
1630     return big_region;
1631 }
1632 
1633 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
1634 {
1635     return range_num_big_pages(va_block->start, va_block->end, big_page_size);
1636 }
1637 
1638 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1639 {
1640     NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
1641     UVM_ASSERT(addr >= va_block->start);
1642     UVM_ASSERT(addr < va_block->end);
1643     return addr;
1644 }
1645 
1646 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1647 {
1648     NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
1649 
1650     // Assume that we don't have to handle multiple big PTEs per system page.
1651     // It's not terribly difficult to implement, but we don't currently have a
1652     // use case.
1653     UVM_ASSERT(big_page_size >= PAGE_SIZE);
1654 
1655     return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
1656 }
1657 
1658 // Returns the big page index (the bit index within
1659 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
1660 // page_index cannot be covered by a big PTE due to alignment or block size,
1661 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
1662 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
1663 {
1664     uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
1665     size_t big_index;
1666 
1667     // Note that this condition also handles the case of having no big pages in
1668     // the block, in which case .first >= .outer.
1669     if (page_index < big_region_all.first || page_index >= big_region_all.outer)
1670         return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;
1671 
1672     big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);
1673 
1674     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
1675     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);
1676 
1677     return big_index;
1678 }
1679 
1680 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
1681                                              uvm_gpu_t *gpu,
1682                                              uvm_page_mask_t *mask_out,
1683                                              const unsigned long *big_ptes_in)
1684 {
1685     uvm_va_block_region_t big_region;
1686     size_t big_page_index;
1687     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
1688 
1689     uvm_page_mask_zero(mask_out);
1690 
1691     for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
1692         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
1693         uvm_page_mask_region_fill(mask_out, big_region);
1694     }
1695 }
1696 
1697 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1698 {
1699     if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
1700         return 0;
1701 
1702     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU));
1703 
1704     // Despite the fact that physical CPU memory can be allocated at sizes
1705     // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory
1706     // with 4K PTEs. Until the core kernel adds support for PMD mappings,
1707     // the return value of this function will remain at PAGE_SIZE.
1708     return PAGE_SIZE;
1709 }
1710 
1711 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
1712 {
1713     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1714     size_t big_page_size, big_page_index;
1715 
1716     if (!gpu_state)
1717         return 0;
1718 
1719     if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
1720         return 0;
1721 
1722     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id));
1723 
1724     if (gpu_state->pte_is_2m)
1725         return UVM_PAGE_SIZE_2M;
1726 
1727     big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id));
1728     big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size);
1729     if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes))
1730         return big_page_size;
1731 
1732     return UVM_PAGE_SIZE_4K;
1733 }
1734 
// Get the size of the physical allocation backing the page, or 0 if not
// resident. Note that this is different from uvm_va_block_page_size_* because
// those return the size of the PTE which maps the page index, which may be
// smaller than the physical allocation. For example, a page backed by a 2MB
// GPU chunk may still be mapped with a 64K or 4K PTE.
1739 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
1740 {
1741     uvm_va_block_gpu_state_t *gpu_state;
1742     uvm_chunk_size_t chunk_size;
1743 
1744     if (UVM_ID_IS_CPU(page.processor)) {
1745         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index);
1746 
1747         if (!uvm_page_mask_test(&block->cpu.resident, page.page_index))
1748             return 0;
1749 
1750         UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
1751         return (NvU32)uvm_cpu_chunk_get_size(chunk);
1752     }
1753 
1754     gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
1755     if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index))
1756         return 0;
1757 
1758     UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
1759     block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
1760     return (NvU32)chunk_size;
1761 }
1762 
1763 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
1764 {
1765     uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
1766 
1767     // ATOMIC and WRITE are synonyms for the CPU
1768     if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE)
1769         pte_bit_index = UVM_PTE_BITS_CPU_WRITE;
1770     else if (prot == UVM_PROT_READ_ONLY)
1771         pte_bit_index = UVM_PTE_BITS_CPU_READ;
1772     else
1773         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1774 
1775     return pte_bit_index;
1776 }
1777 
1778 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
1779 {
1780     uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX;
1781 
1782     if (prot == UVM_PROT_READ_WRITE_ATOMIC)
1783         pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC;
1784     else if (prot == UVM_PROT_READ_WRITE)
1785         pte_bit_index = UVM_PTE_BITS_GPU_WRITE;
1786     else if (prot == UVM_PROT_READ_ONLY)
1787         pte_bit_index = UVM_PTE_BITS_GPU_READ;
1788     else
1789         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1790 
1791     return pte_bit_index;
1792 }
1793 
1794 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1795 {
1796     uvm_va_block_gpu_state_t *gpu_state;
1797 
1798     if (UVM_ID_IS_CPU(processor))
1799         return &block->cpu.resident;
1800 
1801     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1802 
1803     UVM_ASSERT(gpu_state);
1804     return &gpu_state->resident;
1805 }
1806 
1807 // Get the page residency mask for a processor
1808 //
// Notably this will allocate GPU state if not yet present; if that allocation
// fails, NULL is returned.
1811 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor)
1812 {
1813     uvm_va_block_gpu_state_t *gpu_state;
1814 
1815     if (UVM_ID_IS_CPU(processor))
1816         return &block->cpu.resident;
1817 
1818     gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
1819     if (!gpu_state)
1820         return NULL;
1821 
1822     return &gpu_state->resident;
1823 }
1824 
1825 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block,
1826                                                            uvm_processor_id_t processor,
1827                                                            uvm_prot_t prot)
1828 {
1829     uvm_va_block_gpu_state_t *gpu_state;
1830 
1831     if (UVM_ID_IS_CPU(processor))
1832         return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)];
1833 
1834     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1835 
1836     UVM_ASSERT(gpu_state);
1837     return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)];
1838 }
1839 
1840 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1841 {
1842     return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
1843 }
1844 
1845 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
1846 {
1847     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
1848     UVM_ASSERT(gpu_state);
1849 
1850     return &gpu_state->evicted;
1851 }
1852 
1853 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index)
1854 {
1855     uvm_processor_id_t id;
1856     for_each_id_in_mask(id, &block->resident) {
1857         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index))
1858             return true;
1859     }
1860 
1861     return false;
1862 }
1863 
1864 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1865 {
1866     uvm_va_block_gpu_state_t *gpu_state;
1867     size_t chunk_index;
1868 
1869     if (UVM_ID_IS_CPU(proc))
1870         return uvm_page_mask_test(&block->cpu.allocated, page_index);
1871 
1872     gpu_state = uvm_va_block_gpu_state_get(block, proc);
1873     if (!gpu_state)
1874         return false;
1875 
1876     chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL);
1877     return gpu_state->chunks[chunk_index] != NULL;
1878 }
1879 
1880 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1881 {
1882     const uvm_page_mask_t *resident_mask;
1883 
1884     if (UVM_ID_IS_CPU(proc)) {
1885         resident_mask = &block->cpu.resident;
1886     }
1887     else {
1888         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc);
1889         if (!gpu_state)
1890             return false;
1891 
1892         resident_mask = &gpu_state->resident;
1893     }
1894 
1895     return uvm_page_mask_test(resident_mask, page_index);
1896 }
1897 
// Compute the GPUs that have at least the given access permissions for the
// given region. A GPU's bit is set if any page in the region has the
// permissions on that GPU.
1901 static void block_region_authorized_gpus(uvm_va_block_t *va_block,
1902                                          uvm_va_block_region_t region,
1903                                          uvm_prot_t access_permission,
1904                                          uvm_processor_mask_t *authorized_gpus)
1905 {
1906     uvm_gpu_id_t gpu_id;
1907     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission);
1908 
1909     uvm_processor_mask_zero(authorized_gpus);
1910 
1911     // Test all GPUs with mappings on the block
1912     for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) {
1913         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1914         if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region))
1915             uvm_processor_mask_set(authorized_gpus, gpu_id);
1916     }
1917 }
1918 
// Compute the processors that have at least the given access permissions for
// the given region. A processor's bit is set if any page in the region has
// the permissions on that processor.
1922 static void block_region_authorized_processors(uvm_va_block_t *va_block,
1923                                                uvm_va_block_region_t region,
1924                                                uvm_prot_t access_permission,
1925                                                uvm_processor_mask_t *authorized_processors)
1926 {
1927     uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission);
1928 
1929     // Compute GPUs
1930     block_region_authorized_gpus(va_block, region, access_permission, authorized_processors);
1931 
1932     // Test CPU
1933     if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) &&
1934         !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) {
1935         uvm_processor_mask_set(authorized_processors, UVM_ID_CPU);
1936     }
1937 }
1938 
1939 static void block_page_authorized_processors(uvm_va_block_t *va_block,
1940                                              uvm_page_index_t page_index,
1941                                              uvm_prot_t access_permission,
1942                                              uvm_processor_mask_t *authorized_processors)
1943 {
1944     block_region_authorized_processors(va_block,
1945                                        uvm_va_block_region_for_page(page_index),
1946                                        access_permission,
1947                                        authorized_processors);
1948 }
1949 
1950 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
1951                                                     uvm_va_block_region_t region,
1952                                                     uvm_gpu_id_t gpu_id,
1953                                                     uvm_prot_t required_prot)
1954 {
1955     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot);
1956     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1957 
1958     if (!gpu_state)
1959         return false;
1960 
1961     return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region);
1962 }
1963 
1964 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
1965                                                           uvm_va_block_region_t region,
1966                                                           uvm_processor_id_t processor_id,
1967                                                           uvm_prot_t required_prot)
1968 {
1969     if (UVM_ID_IS_CPU(processor_id)) {
1970         uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot);
1971 
1972         return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region);
1973     }
1974     else {
1975         return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot);
1976     }
1977 }
1978 
1979 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
1980                                          uvm_page_index_t page_index,
1981                                          uvm_gpu_id_t gpu_id,
1982                                          uvm_prot_t required_prot)
1983 {
1984     return block_is_gpu_authorized_on_whole_region(va_block,
1985                                                    uvm_va_block_region_for_page(page_index),
1986                                                    gpu_id,
1987                                                    required_prot);
1988 }
1989 
1990 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block,
1991                                                uvm_page_index_t page_index,
1992                                                uvm_processor_id_t processor_id,
1993                                                uvm_prot_t required_prot)
1994 {
1995     return block_is_processor_authorized_on_whole_region(va_block,
1996                                                          uvm_va_block_region_for_page(page_index),
1997                                                          processor_id,
1998                                                          required_prot);
1999 }
2000 
// Compute the GPUs that have a copy of the given page resident in their memory
2002 static void block_page_resident_gpus(uvm_va_block_t *va_block,
2003                                      uvm_page_index_t page_index,
2004                                      uvm_processor_mask_t *resident_gpus)
2005 {
2006     uvm_gpu_id_t id;
2007     uvm_processor_mask_zero(resident_gpus);
2008 
2009     for_each_gpu_id_in_mask(id, &va_block->resident) {
2010         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) {
2011             UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
2012             uvm_processor_mask_set(resident_gpus, id);
2013         }
2014     }
2015 }
2016 
2017 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2018                                            uvm_page_index_t page_index,
2019                                            uvm_processor_mask_t *resident_processors)
2020 {
2021     block_page_resident_gpus(va_block, page_index, resident_processors);
2022 
2023     if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) {
2024         UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
2025         uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
2026     }
2027 }
2028 
2029 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index)
2030 {
2031     uvm_processor_mask_t resident_processors;
2032     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
2033 
2034     return uvm_processor_mask_get_count(&resident_processors);
2035 }
2036 
2037 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block,
2038                                                                   uvm_page_index_t page_index,
2039                                                                   uvm_processor_id_t processor,
2040                                                                   const uvm_processor_mask_t *processor_mask)
2041 {
2042     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2043     uvm_processor_mask_t search_mask;
2044     uvm_processor_id_t id;
2045 
2046     if (processor_mask)
2047         uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident);
2048     else
2049         uvm_processor_mask_copy(&search_mask, &va_block->resident);
2050 
2051     for_each_closest_id(id, &search_mask, processor, va_space) {
2052         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index))
2053             return id;
2054     }
2055 
2056     return UVM_ID_INVALID;
2057 }
2058 
2059 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2060                                                           uvm_page_index_t page_index,
2061                                                           uvm_processor_id_t processor)
2062 {
2063     return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL);
2064 }
2065 
2066 // We don't track the specific aperture of each mapped page. Instead, we assume
2067 // that each virtual mapping from a given processor always targets the closest
2068 // processor on which that page is resident (with special rules for UVM-Lite).
2069 //
2070 // This function verifies that assumption: before a page becomes resident on a
2071 // new location, assert that no processor has a valid mapping to a farther
2072 // processor on that page.
2073 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency)
2074 {
2075     uvm_processor_mask_t resident_procs, mapped_procs;
2076     uvm_processor_id_t mapped_id, closest_id;
2077     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2078 
2079     uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block));
2080 
2081     for_each_id_in_mask(mapped_id, &mapped_procs) {
2082         if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index))
2083             continue;
2084 
2085         uvm_va_block_page_resident_processors(block, page_index, &resident_procs);
2086         UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs));
2087         UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency));
2088         uvm_processor_mask_set(&resident_procs, new_residency);
2089         closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id);
2090         UVM_ASSERT(!uvm_id_equal(closest_id, new_residency));
2091     }
2092 
2093     return true;
2094 }
2095 
2096 // Returns the processor to which page_index should be mapped on gpu
2097 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block,
2098                                                          uvm_gpu_t *gpu,
2099                                                          uvm_page_index_t page_index)
2100 {
2101     uvm_processor_id_t dest_id;
2102 
2103     // UVM-Lite GPUs can only map pages on the preferred location
2104     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id))
2105         return uvm_va_range_get_policy(block->va_range)->preferred_location;
2106 
2107     // Otherwise we always map the closest resident processor
2108     dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id);
2109     UVM_ASSERT(UVM_ID_IS_VALID(dest_id));
2110     return dest_id;
2111 }
2112 
2113 // Returns the processor to which page_index should be mapped on mapping_id
2114 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block,
2115                                                      uvm_processor_id_t mapping_id,
2116                                                      uvm_page_index_t page_index)
2117 {
2118 
2119     if (UVM_ID_IS_CPU(mapping_id))
2120         return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id);
2121 
2122     return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index);
2123 }
2124 
2125 static void block_get_mapped_processors(uvm_va_block_t *block,
2126                                         uvm_processor_id_t resident_id,
2127                                         uvm_page_index_t page_index,
2128                                         uvm_processor_mask_t *mapped_procs)
2129 {
2130     uvm_processor_id_t mapped_id;
2131 
2132     uvm_processor_mask_zero(mapped_procs);
2133 
2134     for_each_id_in_mask(mapped_id, &block->mapped) {
2135         if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) {
2136             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2137 
2138             if (uvm_id_equal(to_map_id, resident_id))
2139                 uvm_processor_mask_set(mapped_procs, mapped_id);
2140         }
2141     }
2142 }
2143 
2144 // We use block_gpu_get_processor_to_map to find the destination processor of a
2145 // given GPU mapping. This function is called when the mapping is established to
2146 // sanity check that the destination of the mapping matches the query.
2147 static bool block_check_mapping_residency_region(uvm_va_block_t *block,
2148                                                  uvm_gpu_t *gpu,
2149                                                  uvm_processor_id_t mapping_dest,
2150                                                  uvm_va_block_region_t region,
2151                                                  const uvm_page_mask_t *page_mask)
2152 {
2153     uvm_page_index_t page_index;
2154     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2155         NvU64 va = uvm_va_block_cpu_page_address(block, page_index);
2156         uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index);
2157         UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map),
2158                        "VA 0x%llx on %s: mapping %s, supposed to map %s",
2159                        va,
2160                        uvm_gpu_name(gpu),
2161                        block_processor_name(block, mapping_dest),
2162                        block_processor_name(block, proc_to_map));
2163     }
2164     return true;
2165 }
2166 
2167 static bool block_check_mapping_residency(uvm_va_block_t *block,
2168                                           uvm_gpu_t *gpu,
2169                                           uvm_processor_id_t mapping_dest,
2170                                           const uvm_page_mask_t *page_mask)
2171 {
2172     return block_check_mapping_residency_region(block,
2173                                                 gpu,
2174                                                 mapping_dest,
2175                                                 uvm_va_block_region_from_block(block),
2176                                                 page_mask);
2177 }
2178 
2179 // Check that there are no mappings targeting resident_id from any processor in
2180 // the block.
2181 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id)
2182 {
2183     uvm_processor_id_t mapped_id;
2184     uvm_page_index_t page_index;
2185 
2186     for_each_id_in_mask(mapped_id, &block->mapped) {
2187         const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id);
2188 
2189         for_each_va_block_page_in_mask(page_index, map_mask, block) {
2190             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2191             UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id));
2192         }
2193     }
2194 
2195     return true;
2196 }
2197 
2198 // Zero all pages of the newly-populated chunk which are not resident anywhere
// else in the system, adding that work to the block's tracker. In all cases,
// this function adds a dependency on the passed-in tracker to the block's
// tracker.
2201 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
2202                                           uvm_gpu_t *gpu,
2203                                           uvm_gpu_chunk_t *chunk,
2204                                           uvm_va_block_region_t chunk_region,
2205                                           uvm_tracker_t *tracker)
2206 {
2207     uvm_va_block_gpu_state_t *gpu_state;
2208     NV_STATUS status;
2209     uvm_gpu_address_t memset_addr_base, memset_addr;
2210     uvm_push_t push;
2211     uvm_gpu_id_t id;
2212     uvm_va_block_region_t subregion;
2213     uvm_page_mask_t *zero_mask;
2214 
2215     UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk));
2216 
2217     if (chunk->is_zero)
2218         return NV_OK;
2219 
2220     gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2221     zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
2222 
2223     if (!zero_mask)
2224         return NV_ERR_NO_MEMORY;
2225 
2226     // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the
2227     // operation.
2228     //
    // We may over-zero pages with this approach. For example, we might be
    // populating a 2MB chunk because only a single page within that chunk needs
    // to be made resident. If we also zero non-resident pages outside of the
    // strict region, that effort is wasted if those pages are later populated
    // on another processor and then migrated here.
2234     //
2235     // We zero all non-resident pages in the chunk anyway for two reasons:
2236     //
2237     // 1) Efficiency. It's better to do all zeros as pipelined transfers once
2238     //    rather than scatter them around for each populate operation.
2239     //
2240     // 2) Optimizing the common case of block_populate_gpu_chunk being called
2241     //    for already-populated chunks. If we zero once at initial populate, we
2242     //    can simply check whether the chunk is present in the array. Otherwise
2243     //    we'd have to recompute the "is any page resident" mask every time.
2244 
2245     // Roll up all pages in chunk_region which are resident somewhere
2246     uvm_page_mask_zero(zero_mask);
2247     for_each_id_in_mask(id, &block->resident)
2248         uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id));
2249 
2250     // If all pages in the chunk are resident somewhere, we don't need to clear
2251     // anything. Just make sure the chunk is tracked properly.
2252     if (uvm_page_mask_region_full(zero_mask, chunk_region)) {
2253         status = uvm_tracker_add_tracker_safe(&block->tracker, tracker);
2254         goto out;
2255     }
2256 
2257     // Complement to get the pages which are not resident anywhere. These
2258     // are the pages which must be zeroed.
2259     uvm_page_mask_complement(zero_mask, zero_mask);
2260 
2261     memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
2262     memset_addr = memset_addr_base;
2263 
2264     status = uvm_push_begin_acquire(gpu->channel_manager,
2265                                     UVM_CHANNEL_TYPE_GPU_INTERNAL,
2266                                     tracker,
2267                                     &push,
2268                                     "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)",
2269                                     chunk->address,
2270                                     chunk->address + uvm_gpu_chunk_get_size(chunk),
2271                                     uvm_va_block_region_start(block, chunk_region),
2272                                     uvm_va_block_region_end(block, chunk_region) + 1,
2273                                     block->start,
2274                                     block->end + 1);
2275     if (status != NV_OK)
2276         goto out;
2277 
2278     for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) {
2279         // Pipeline the memsets since they never overlap with each other
2280         uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2281 
2282         // We'll push one membar later for all memsets in this loop
2283         uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2284 
2285         memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE;
2286         gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion));
2287     }
2288 
2289     // A membar from this GPU is required between this memset and any PTE write
2290     // pointing this or another GPU to this chunk. Otherwise an engine could
2291     // read the PTE then access the page before the memset write is visible to
2292     // that engine.
2293     //
2294     // This memset writes GPU memory, so local mappings need only a GPU-local
2295     // membar. We can't easily determine here whether a peer GPU will ever map
2296     // this page in the future, so always use a sysmembar. uvm_push_end provides
2297     // one by default.
2298     //
2299     // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map
2300     //       this page. When peer access gets enabled, do a MEMBAR_SYS at that
2301     //       point.
2302     uvm_push_end(&push);
2303     status = uvm_tracker_add_push_safe(&block->tracker, &push);
2304 
2305 out:
2306     if (zero_mask)
2307         kmem_cache_free(g_uvm_page_mask_cache, zero_mask);
2308 
2309     return status;
2310 }
2311 
2312 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
2313                                           uvm_va_block_retry_t *retry,
2314                                           uvm_gpu_t *gpu,
2315                                           size_t chunk_index,
2316                                           uvm_va_block_region_t chunk_region)
2317 {
2318     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
2319     uvm_gpu_chunk_t *chunk = NULL;
2320     uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
2321     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2322     NV_STATUS status;
2323 
2324     if (!gpu_state)
2325         return NV_ERR_NO_MEMORY;
2326 
2327     uvm_assert_mutex_locked(&block->lock);
2328     UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu));
2329     UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes);
2330 
2331     // We zero chunks as necessary at initial population, so if the chunk is
2332     // already populated we're done. See the comment in
2333     // block_zero_new_gpu_chunk.
2334     if (gpu_state->chunks[chunk_index])
2335         return NV_OK;
2336 
2337     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region));
2338 
2339     status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk);
2340     if (status != NV_OK)
2341         return status;
2342 
2343     // In some configurations such as SR-IOV heavy, the chunk cannot be
2344     // referenced using its physical address. Create a virtual mapping.
2345     status = uvm_mmu_chunk_map(chunk);
2346     if (status != NV_OK)
2347         goto chunk_free;
2348 
2349     status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
2350     if (status != NV_OK)
2351         goto chunk_unmap;
2352 
2353     // It is safe to modify the page index field without holding any PMM locks
2354     // because the chunk is pinned, which means that none of the other fields in
2355     // the bitmap can change.
2356     chunk->va_block_page_index = chunk_region.first;
2357 
2358     // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at
2359     // compile-time that it can store VA Block page indexes.
2360     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
2361 
2362     status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
2363     if (status != NV_OK)
2364         goto chunk_unmap;
2365 
2366     if (block_test && block_test->inject_populate_error) {
2367         block_test->inject_populate_error = false;
2368 
2369         // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
2370         // causing a fatal OOM failure.
2371         status = NV_ERR_MORE_PROCESSING_REQUIRED;
2372         goto chunk_unmap_indirect_peers;
2373     }
2374 
2375     // Record the used chunk so that it can be unpinned at the end of the whole
2376     // operation.
2377     block_retry_add_used_chunk(retry, chunk);
2378     gpu_state->chunks[chunk_index] = chunk;
2379 
2380     return NV_OK;
2381 
2382 chunk_unmap_indirect_peers:
2383     block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
2384 
2385 chunk_unmap:
2386     uvm_mmu_chunk_unmap(chunk, &block->tracker);
2387 
2388 chunk_free:
2389     // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it
2390     // placed in the block tracker.
2391     uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
2392 
2393     return status;
2394 }
2395 
2396 // Populate all chunks which cover the given region and page mask.
2397 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
2398                                           uvm_va_block_retry_t *retry,
2399                                           uvm_gpu_t *gpu,
2400                                           uvm_va_block_region_t region,
2401                                           const uvm_page_mask_t *populate_mask)
2402 {
2403     uvm_va_block_region_t chunk_region, check_region;
2404     size_t chunk_index;
2405     uvm_page_index_t page_index;
2406     uvm_chunk_size_t chunk_size;
2407     NV_STATUS status;
2408 
2409     page_index = uvm_va_block_first_page_in_mask(region, populate_mask);
2410     if (page_index == region.outer)
2411         return NV_OK;
2412 
2413     chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
2414     chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
2415 
2416     while (1) {
2417         check_region = uvm_va_block_region(max(chunk_region.first, region.first),
2418                                            min(chunk_region.outer, region.outer));
2419         page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask);
2420         if (page_index != check_region.outer) {
2421             status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region);
2422             if (status != NV_OK)
2423                 return status;
2424         }
2425 
2426         if (check_region.outer == region.outer)
2427             break;
2428 
2429         ++chunk_index;
2430         chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer);
2431         chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE));
2432     }
2433 
2434     return NV_OK;
2435 }
2436 
2437 static NV_STATUS block_populate_pages(uvm_va_block_t *block,
2438                                       uvm_va_block_retry_t *retry,
2439                                       uvm_va_block_context_t *block_context,
2440                                       uvm_processor_id_t dest_id,
2441                                       uvm_va_block_region_t region,
2442                                       const uvm_page_mask_t *page_mask)
2443 {
2444     NV_STATUS status;
2445     const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id);
2446     uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
2447     uvm_memcg_context_t memcg_context;
2448 
2449     if (!resident_mask)
2450         return NV_ERR_NO_MEMORY;
2451 
2452     if (page_mask)
2453         uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask);
2454     else
2455         uvm_page_mask_complement(populate_page_mask, resident_mask);
2456 
2457     if (UVM_ID_IS_GPU(dest_id))
2458         return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
2459 
2460     uvm_memcg_context_start(&memcg_context, block_context->mm);
2461     status = block_populate_pages_cpu(block, populate_page_mask, region, block_context);
2462     uvm_memcg_context_end(&memcg_context);
2463     return status;
2464 }
2465 
2466 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
2467 {
2468     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2469 
2470     return &va_space->can_copy_from[uvm_id_value(from)];
2471 }
2472 
2473 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
2474 {
2475     return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
2476 }
2477 
2478 // Get the chunk containing the given page, along with the offset of that page
2479 // within the chunk.
2480 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
2481 {
2482     uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
2483     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
2484     size_t chunk_index;
2485     uvm_gpu_chunk_t *chunk;
2486     uvm_chunk_size_t chunk_size;
2487 
2488     UVM_ASSERT(gpu_state);
2489 
2490     chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
2491     chunk = gpu_state->chunks[chunk_index];
2492     UVM_ASSERT(chunk);
2493 
2494     if (chunk_offset) {
2495         size_t page_offset = block_page.page_index -
                             uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
2497         *chunk_offset = page_offset * PAGE_SIZE;
2498     }
2499 
2500     return chunk;
2501 }
2502 
// Get the physical GPU address of a block's page from the POV of the specified
// GPU. This is the address that should be used for making PTEs for the
// specified GPU.
2505 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
2506                                                       block_phys_page_t block_page,
2507                                                       uvm_gpu_t *gpu)
2508 {
2509     uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2510     size_t chunk_offset;
2511     uvm_gpu_chunk_t *chunk;
2512 
2513     UVM_ASSERT(accessing_gpu_state);
2514 
2515     if (UVM_ID_IS_CPU(block_page.processor)) {
2516         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
2517         NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
2518         uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
2519                                                                        uvm_cpu_chunk_get_size(chunk),
2520                                                                        block_page.page_index);
2521 
2522         // The page should be mapped for physical access already as we do that
2523         // eagerly on CPU page population and GPU state alloc.
2524         UVM_ASSERT(dma_addr != 0);
2525         dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
2526 
2527         return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
2528     }
2529 
2530     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2531 
2532     if (uvm_id_equal(block_page.processor, gpu->id)) {
2533         return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
2534     }
2535     else {
2536         uvm_gpu_phys_address_t phys_addr;
2537         uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
2538         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2539 
2540         UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2541         phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
2542         phys_addr.address += chunk_offset;
2543         return phys_addr;
2544     }
2545 }
2546 
2547 // Get the physical GPU address of a block's page from the POV of the specified
2548 // GPU, suitable for accessing the memory from UVM-internal CE channels.
2549 //
// Notably this may be different from block_phys_page_address() to handle CE
// limitations in addressing physical memory directly.
2552 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block,
2553                                                       block_phys_page_t block_page,
2554                                                       uvm_gpu_t *gpu)
2555 {
2556     uvm_gpu_t *owning_gpu;
2557     size_t chunk_offset;
2558     uvm_gpu_chunk_t *chunk;
2559     uvm_gpu_address_t copy_addr;
2560     uvm_va_space_t *va_space;
2561 
2562     UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor),
2563                    "from %s to %s\n",
2564                    block_processor_name(block, gpu->id),
2565                    block_processor_name(block, block_page.processor));
2566 
2567     // CPU and local GPU accesses can rely on block_phys_page_address, but the
2568     // resulting physical address may need to be converted into virtual.
2569     if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id))
2570         return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu));
2571 
2572     va_space = uvm_va_block_get_va_space(block);
2573 
2574     // See the comments on the peer_identity_mappings_supported assignments in
2575     // the HAL for why we disable direct copies between peers.
2576     owning_gpu = block_get_gpu(block, block_page.processor);
2577 
2578     UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2579 
2580     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2581     copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu);
2582     copy_addr.address += chunk_offset;
2583     return copy_addr;
2584 }
2585 
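// Return the physical address of the given page resident on the given
// processor, from the perspective of the accessing GPU. The caller must hold
// the va_block lock.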
2586 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
2587                                                           uvm_page_index_t page_index,
2588                                                           uvm_processor_id_t residency,
2589                                                           uvm_gpu_t *gpu)
2590 {
2591     uvm_assert_mutex_locked(&va_block->lock);
2592 
2593     return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu);
2594 }
2595 
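// Shortcut for pages resident on the accessing GPU itself.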
2596 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
2597                                                           uvm_page_index_t page_index,
2598                                                           uvm_gpu_t *gpu)
2599 {
2600     return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu);
2601 }
2602 
2603 typedef struct
2604 {
2605     // Location of the memory
2606     uvm_processor_id_t id;
2607 
2608     // Whether the whole block has a single physically-contiguous chunk of
2609     // storage on the processor.
2610     bool is_block_contig;
2611 
2612     // Starting address of the physically-contiguous allocation, from the view
2613     // of the copying GPU. Valid only if is_block_contig.
2614     uvm_gpu_address_t gpu_address;
2615 } block_copy_addr_t;
2616 
2617 typedef struct
2618 {
2619     block_copy_addr_t src;
2620     block_copy_addr_t dst;
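
    // Scratch DMA buffer used when the Confidential Computing feature is
    // enabled. Allocated in block_copy_begin_push() and released in
    // block_copy_end_push().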
2621     uvm_conf_computing_dma_buffer_t *dma_buffer;
2622 } block_copy_state_t;
2623 
2624 // Begin a push appropriate for copying data from src_id processor to dst_id processor.
2625 // One of src_id and dst_id needs to be a GPU.
2626 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block,
2627                                        block_copy_state_t *copy_state,
2628                                        uvm_tracker_t *tracker,
2629                                        uvm_push_t *push)
2630 {
2631     uvm_gpu_t *gpu;
2632     NV_STATUS status;
2633     uvm_channel_type_t channel_type;
2634     uvm_tracker_t *tracker_ptr = tracker;
2635     uvm_processor_id_t dst_id = copy_state->dst.id;
2636     uvm_processor_id_t src_id = copy_state->src.id;
2637     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
2638 
2639     UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
2640                    "Unexpected copy to self, processor %s\n",
2641                    block_processor_name(va_block, src_id));
2642 
2643     if (UVM_ID_IS_CPU(src_id)) {
2644         gpu = block_get_gpu(va_block, dst_id);
2645         channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
2646     }
2647     else if (UVM_ID_IS_CPU(dst_id)) {
2648         gpu = block_get_gpu(va_block, src_id);
2649         channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
2650     }
2651     else {
2652         // For GPU to GPU copies, prefer to "push" the data from the source as
2653         // that works better at least for P2P over PCI-E.
2654         gpu = block_get_gpu(va_block, src_id);
2655 
2656         channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
2657     }
2658 
2659     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id),
2660                    "GPU %s dst %s src %s\n",
2661                    block_processor_name(va_block, gpu->id),
2662                    block_processor_name(va_block, dst_id),
2663                    block_processor_name(va_block, src_id));
2664     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id),
2665                    "GPU %s dst %s src %s\n",
2666                    block_processor_name(va_block, gpu->id),
2667                    block_processor_name(va_block, dst_id),
2668                    block_processor_name(va_block, src_id));
2669 
2670     if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
2671         uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id);
2672         return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager,
2673                                                  dst_gpu,
2674                                                  tracker,
2675                                                  push,
2676                                                  "Copy from %s to %s for block [0x%llx, 0x%llx]",
2677                                                  block_processor_name(va_block, src_id),
2678                                                  block_processor_name(va_block, dst_id),
2679                                                  va_block->start,
2680                                                  va_block->end);
2681     }
2682 
2683     if (uvm_conf_computing_mode_enabled(gpu)) {
2684         // When the Confidential Computing feature is enabled, additional
2685         // dependencies apply to the input tracker as well as the dma_buffer
2686         // tracker.
2687         // * In the CPU to GPU case, UVM performs the CPU-side crypto
2688         //   operations before the GPU copy, so both the dma_buffer tracker
2689         //   and the input tracker must be completed first.
2690         // * In the GPU to CPU case, the GPU copy happens first, but the same
2691         //   principle applies, so UVM acquires both trackers.
2692         status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
2693         if (status != NV_OK)
2694             goto error;
2695 
2696         UVM_ASSERT(copy_state->dma_buffer == NULL);
2697         status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
2698                                                      &copy_state->dma_buffer,
2699                                                      &local_tracker);
2700 
2701         if (status != NV_OK)
2702             goto error;
2703 
2704         if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
2705             status = uvm_tracker_wait(&local_tracker);
2706             if (status != NV_OK)
2707                 goto error;
2708         }
2709 
2710         tracker_ptr = &local_tracker;
2711     }
2712 
2713     status = uvm_push_begin_acquire(gpu->channel_manager,
2714                                     channel_type,
2715                                     tracker_ptr,
2716                                     push,
2717                                     "Copy from %s to %s for block [0x%llx, 0x%llx]",
2718                                     block_processor_name(va_block, src_id),
2719                                     block_processor_name(va_block, dst_id),
2720                                     va_block->start,
2721                                     va_block->end);
2722 
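// The error label is also reached on success: in either case the local
// tracker is deinitialized and the current status is returned.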
2723 error:
2724     // Caller is responsible for freeing the DMA buffer on error
2725     uvm_tracker_deinit(&local_tracker);
2726     return status;
2727 }
2728 
2729 // A page is clean iff...
2730 // the destination is the preferred location and
2731 // the source is the CPU and
2732 // the destination does not support faults/eviction and
2733 // the CPU page is not dirty
2734 static bool block_page_is_clean(uvm_va_block_t *block,
2735                                 uvm_processor_id_t dst_id,
2736                                 uvm_processor_id_t src_id,
2737                                 uvm_page_index_t page_index)
2738 {
2739     return !uvm_va_block_is_hmm(block) &&
2740            uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) &&
2741            UVM_ID_IS_CPU(src_id) &&
2742            !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
2743            !block_cpu_page_is_dirty(block, page_index);
2744 }
2745 
2746 // When the destination is the CPU...
2747 // if the source is the preferred location, mark as clean
2748 // otherwise, mark as dirty
2749 static void block_update_page_dirty_state(uvm_va_block_t *block,
2750                                           uvm_processor_id_t dst_id,
2751                                           uvm_processor_id_t src_id,
2752                                           uvm_page_index_t page_index)
2753 {
2754     if (UVM_ID_IS_GPU(dst_id))
2755         return;
2756 
2757     if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location))
2758         block_mark_cpu_page_clean(block, page_index);
2759     else
2760         block_mark_cpu_page_dirty(block, page_index);
2761 }
2762 
2763 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
2764 {
2765     uvm_gpu_t *gpu;
2766 
2767     if (UVM_ID_IS_CPU(id))
2768         return;
2769 
2770     gpu = block_get_gpu(block, id);
2771 
2772     // If the block is of the max size and the GPU supports eviction, mark the
2773     // root chunk as used in PMM.
2774     // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks.
2775     if (!uvm_va_block_is_hmm(block) &&
2776         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2777         uvm_gpu_supports_eviction(gpu)) {
2778         // The chunk has to be there if this GPU is resident
2779         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id));
2780         uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]);
2781     }
2782 }
2783 
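// Mark the processor as having resident pages in this block. On the first
// transition to resident, also mark the backing root chunk as used in PMM
// when applicable.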
2784 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2785 {
2786     UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2787 
2788     if (uvm_processor_mask_test_and_set(&block->resident, id))
2789         return;
2790 
2791     block_mark_memory_used(block, id);
2792 }
2793 
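// Clear the processor's residency for this block, and mark the backing root
// chunk as unused in PMM when applicable.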
2794 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2795 {
2796     uvm_gpu_t *gpu;
2797 
2798     UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2799 
2800     if (!uvm_processor_mask_test_and_clear(&block->resident, id))
2801         return;
2802 
2803     if (UVM_ID_IS_CPU(id))
2804         return;
2805 
2806     gpu = block_get_gpu(block, id);
2807 
2808     // If the block is of the max size and the GPU supports eviction, mark the
2809     // root chunk as unused in PMM.
2810     if (!uvm_va_block_is_hmm(block) &&
2811         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2812         uvm_gpu_supports_eviction(gpu)) {
2813         // The chunk may not be there any more when residency is cleared.
2814         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2815         if (gpu_state && gpu_state->chunks[0])
2816             uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]);
2817     }
2818 }
2819 
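// Sanity check used by block_copy_get_address(): verify that the copy address
// derived from the cached contiguous base matches the one computed per page.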
2820 static bool block_phys_copy_contig_check(uvm_va_block_t *block,
2821                                          uvm_page_index_t page_index,
2822                                          const uvm_gpu_address_t *base_address,
2823                                          uvm_processor_id_t proc_id,
2824                                          uvm_gpu_t *copying_gpu)
2825 {
2826     uvm_gpu_address_t page_address;
2827     uvm_gpu_address_t contig_address = *base_address;
2828 
2829     contig_address.address += page_index * PAGE_SIZE;
2830 
2831     page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu);
2832 
2833     return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
2834 }
2835 
2836 // Check if the VA block has a single physically-contiguous chunk of storage
2837 // on the processor.
2838 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id)
2839 {
2840     uvm_cpu_chunk_t *chunk;
2841 
2842     if (UVM_ID_IS_GPU(id))
2843         return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
2844 
2845     chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL);
2846     return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
2847 }
2848 
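// Return the region of pages containing page_index that is backed by a single
// physically-contiguous chunk on resident_id.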
2849 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
2850                                                       uvm_page_index_t page_index,
2851                                                       uvm_processor_id_t resident_id)
2852 {
2853     if (UVM_ID_IS_CPU(resident_id)) {
2854         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
2855         return uvm_cpu_chunk_block_region(block, chunk, page_index);
2856     }
2857     else {
2858         uvm_chunk_size_t chunk_size;
2859         (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size);
2860         return uvm_va_block_chunk_region(block, chunk_size, page_index);
2861     }
2862 }
2863 
2864 // Like block_phys_page_copy_address, but uses the address cached in bca when
2865 // possible.
2866 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
2867                                                 block_copy_addr_t *bca,
2868                                                 uvm_page_index_t page_index,
2869                                                 uvm_gpu_t *copying_gpu)
2870 {
2871     if (bca->is_block_contig) {
2872         uvm_gpu_address_t addr = bca->gpu_address;
2873         addr.address += page_index * PAGE_SIZE;
2874         UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu));
2875         return addr;
2876     }
2877 
2878     return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu);
2879 }
2880 
2881 // When the Confidential Computing feature is enabled, the function performs
2882 // CPU-side page encryption and GPU-side decryption into the CPR. GPU
2883 // operations respect the caller's membar previously set in the push.
2884 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
2885                                                       block_copy_state_t *copy_state,
2886                                                       uvm_va_block_region_t region,
2887                                                       uvm_push_t *push)
2888 {
2889     uvm_push_flag_t membar_flag = 0;
2890     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2891     uvm_page_index_t page_index = region.first;
2892     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
2893     struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
2894     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
2895     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
2896     char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
2897                                         (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
2898     uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
2899     char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
2900 
2901     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
2902     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
2903 
2904     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
2905 
2906     // See comment in block_copy_begin_push.
2907     UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
2908 
2909     staging_buffer.address += page_index * PAGE_SIZE;
2910     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2911 
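    // Consume the caller's pending membar flag, if any. It is re-applied only
    // to the last decrypt in the loop below; earlier copies use MEMBAR_NONE.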
2912     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
2913         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
2914     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
2915         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
2916 
2917     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
2918     // decryption must happen on a PAGE_SIZE basis.
2919     for_each_va_block_page_in_region(page_index, region) {
2920         void *src_cpu_virt_addr;
2921 
2922         // The caller guarantees that all pages in region are contiguous,
2923         // meaning they're guaranteed to be part of the same compound page.
2924         UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index));
2925 
2926         src_cpu_virt_addr = kmap(src_page);
2927         uvm_conf_computing_cpu_encrypt(push->channel,
2928                                        cpu_va_staging_buffer,
2929                                        src_cpu_virt_addr,
2930                                        NULL,
2931                                        PAGE_SIZE,
2932                                        cpu_auth_tag_buffer);
2933         kunmap(src_page);
2934 
2935         // The first LCE operation should be non-pipelined to guarantee ordering,
2936         // as we do not know when the last non-pipelined copy was.
2937         // The last one applies the membar originally planned for the push, if any.
2938         // TODO: 3857691: Inherit policy instead of forcing the first invocation
2939         //                to be non-pipelined.
2940         if (page_index > region.first)
2941             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2942 
2943         if (page_index < (region.outer - 1))
2944             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2945         else if (membar_flag)
2946             uvm_push_set_flag(push, membar_flag);
2947 
2948         gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
2949 
2950         src_page++;
2951         dst_address.address += PAGE_SIZE;
2952         cpu_va_staging_buffer += PAGE_SIZE;
2953         staging_buffer.address += PAGE_SIZE;
2954         cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2955         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2956     }
2957 }
2958 
2959 // When the Confidential Computing feature is enabled, the function performs
2960 // GPU side page encryption. GPU operations respect the caller's membar
2961 // previously set in the push.
2962 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
2963                                                       block_copy_state_t *copy_state,
2964                                                       uvm_va_block_region_t region,
2965                                                       uvm_push_t *push)
2966 {
2967     uvm_push_flag_t membar_flag = 0;
2968     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2969     uvm_page_index_t page_index = region.first;
2970     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
2971     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
2972     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
2973     uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
2974 
2975     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
2976     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
2977 
2978     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
2979 
2980     staging_buffer.address += page_index * PAGE_SIZE;
2981     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2982 
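    // Consume the caller's pending membar flag, if any. It is re-applied only
    // to the last encrypt in the loop below; earlier copies use MEMBAR_NONE.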
2983     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
2984         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
2985     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
2986         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
2987 
2988     // Because we use kmap() for mapping pages for CPU-side crypto operations
2989     // and it only guarantees PAGE_SIZE contiguity, all encryption and
2990     // decryption must happen on a PAGE_SIZE basis.
2991     for_each_va_block_page_in_region(page_index, region) {
2992         uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
2993 
2994         // The first LCE operation should be non-pipelined to guarantee ordering,
2995         // as we do not know when the last non-pipelined copy was.
2996         // The last one applies the membar originally planned for the push, if any.
2997         // TODO: 3857691: Inherit policy instead of forcing the first invocation
2998         //                to be non-pipelined.
2999         if (page_index > region.first)
3000             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3001 
3002         if (page_index < (region.outer - 1))
3003             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3004         else if (membar_flag)
3005             uvm_push_set_flag(push, membar_flag);
3006 
3007         gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3008 
3009         src_address.address += PAGE_SIZE;
3010         staging_buffer.address += PAGE_SIZE;
3011         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3012     }
3013 
3014     uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3015 }
3016 
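// When the Confidential Computing feature is enabled, complete a GPU to CPU
// copy by waiting for the push and decrypting the staged pages into their
// destination CPU pages. No-op when the copy destination is a GPU.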
3017 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3018                                                   block_copy_state_t *copy_state,
3019                                                   uvm_push_t *push)
3020 {
3021     NV_STATUS status;
3022     uvm_page_index_t page_index;
3023     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3024     uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3025     void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3026     void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3027 
3028     UVM_ASSERT(uvm_conf_computing_mode_enabled(push->gpu));
3029 
3030     if (UVM_ID_IS_GPU(copy_state->dst.id))
3031         return NV_OK;
3032 
3033     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3034 
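    // The CPU decryption below consumes the staging buffers written by the GPU
    // encryption in this push, so the push must be complete first.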
3035     status = uvm_push_wait(push);
3036     if (status != NV_OK)
3037         return status;
3038 
3039     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
3040     // decryption must happen on a PAGE_SIZE basis.
3041     for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3042         struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
3043         void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3044         void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3045         void *cpu_page_address = kmap(dst_page);
3046 
3047         status = uvm_conf_computing_cpu_decrypt(push->channel,
3048                                                 cpu_page_address,
3049                                                 staging_buffer,
3050                                                 &dma_buffer->decrypt_iv[page_index],
3051                                                 PAGE_SIZE,
3052                                                 auth_tag_buffer);
3053         kunmap(dst_page);
3054         if (status != NV_OK) {
3055             // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3056             //                    failures & other failures gracefully.
3057             // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3058             // tag verification fails. Should this happen, it is considered a
3059             // critical failure from which there is no recovery.
3060             uvm_global_set_fatal_error(status);
3061             return status;
3062         }
3063     }
3064 
3065     return NV_OK;
3066 }
3067 
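// Issue the copy of the given region as part of the given push. When the
// Confidential Computing feature is enabled the copy is staged through the
// encrypt/decrypt helpers above; otherwise a plain CE memcopy is used.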
3068 static void block_copy_push(uvm_va_block_t *block,
3069                             block_copy_state_t *copy_state,
3070                             uvm_va_block_region_t region,
3071                             uvm_push_t *push)
3072 {
3073     uvm_gpu_address_t gpu_dst_address;
3074     uvm_gpu_address_t gpu_src_address;
3075     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3076 
3077     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3078 
3079     if (uvm_conf_computing_mode_enabled(gpu)) {
3080         if (UVM_ID_IS_CPU(copy_state->src.id))
3081             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3082         else
3083             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3084 
3085         return;
3086     }
3087 
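    // Non-confidential path: a single CE memcopy covers the whole region using
    // the addresses computed by block_copy_get_address().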
3088     gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3089     gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3090     gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3091 }
3092 
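// End a copy push started by block_copy_begin_push(): finish any CPU-side
// decryption, add the push to the copy tracker, and release Confidential
// Computing resources associated with the copy.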
3093 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3094                                      block_copy_state_t *copy_state,
3095                                      uvm_tracker_t *copy_tracker,
3096                                      NV_STATUS push_status,
3097                                      uvm_push_t *push)
3098 {
3099     NV_STATUS tracker_status;
3100 
3101     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3102     //       by that GPU, use a GPU-local membar if no peer can currently
3103     //       map this page. When peer access gets enabled, do a MEMBAR_SYS
3104     //       at that point.
3105     uvm_push_end(push);
3106 
3107     if ((push_status == NV_OK) && uvm_conf_computing_mode_enabled(push->gpu))
3108         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3109 
3110     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3111     if (push_status == NV_OK)
3112         push_status = tracker_status;
3113 
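    // Release the DMA buffer back to its pool, tracking this push so the
    // buffer is not reused before the push completes.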
3114     if (uvm_conf_computing_mode_enabled(push->gpu)) {
3115         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3116 
3117         uvm_tracker_overwrite_with_push(&local_tracker, push);
3118         uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
3119                                            copy_state->dma_buffer,
3120                                            &local_tracker);
3121         copy_state->dma_buffer = NULL;
3122         uvm_tracker_deinit(&local_tracker);
3123     }
3124 
3125     return push_status;
3126 }
3127 
3128 // Copies pages resident on the src_id processor to the dst_id processor
3129 //
3130 // The function adds the pages that were successfully copied to the output
3131 // migrated_pages mask and returns the number of pages in copied_pages. These
3132 // fields are reliable even if an error is returned.
3133 //
3134 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
3135 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
3136                                                    uvm_va_block_context_t *block_context,
3137                                                    uvm_processor_id_t dst_id,
3138                                                    uvm_processor_id_t src_id,
3139                                                    uvm_va_block_region_t region,
3140                                                    uvm_page_mask_t *copy_mask,
3141                                                    const uvm_page_mask_t *prefetch_page_mask,
3142                                                    uvm_va_block_transfer_mode_t transfer_mode,
3143                                                    uvm_page_mask_t *migrated_pages,
3144                                                    NvU32 *copied_pages,
3145                                                    uvm_tracker_t *copy_tracker)
3146 {
3147     NV_STATUS status = NV_OK;
3148     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3149     uvm_gpu_t *copying_gpu = NULL;
3150     uvm_push_t push;
3151     uvm_page_index_t page_index;
3152     uvm_page_index_t contig_start_index = region.outer;
3153     uvm_page_index_t last_index = region.outer;
3154     uvm_range_group_range_t *rgr = NULL;
3155     bool rgr_has_changed = false;
3156     uvm_make_resident_cause_t cause = block_context->make_resident.cause;
3157     uvm_make_resident_cause_t contig_cause = cause;
3158     const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3159                                cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3160                                cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask;
3161     block_copy_state_t copy_state = {0};
3162     uvm_va_range_t *va_range = block->va_range;
3163     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3164 
3165     copy_state.src.id = src_id;
3166     copy_state.dst.id = dst_id;
3167     copy_state.src.is_block_contig = is_block_phys_contig(block, src_id);
3168     copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id);
3169 
3170     *copied_pages = 0;
3171 
3172     // If there are no pages to be copied, exit early
3173     if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) ||
3174         !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
3175         return NV_OK;
3176 
3177     // uvm_range_group_range_iter_first should only be called when the va_space
3178     // lock is held, which is always the case unless an eviction is taking
3179     // place.
3180     if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3181         rgr = uvm_range_group_range_iter_first(va_space,
3182                                                uvm_va_block_region_start(block, region),
3183                                                uvm_va_block_region_end(block, region));
3184         rgr_has_changed = true;
3185     }
3186 
3187     if (UVM_ID_IS_CPU(dst_id)) {
3188         uvm_memcg_context_t memcg_context;
3189 
3190         // To support staging through CPU, populate CPU pages on demand.
3191         // GPU destinations should have their pages populated already, but
3192         // that might change if we add staging through GPUs.
3193         uvm_memcg_context_start(&memcg_context, block_context->mm);
3194         status = block_populate_pages_cpu(block, copy_mask, region, block_context);
3195         uvm_memcg_context_end(&memcg_context);
3196         if (status != NV_OK)
3197             return status;
3198     }
3199 
3200     // TODO: Bug 3745051: This function is complicated and needs refactoring
3201     for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
3202         NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
3203         uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ?
3204                                                 UVM_MAKE_RESIDENT_CAUSE_PREFETCH:
3205                                                 cause;
3206 
3207         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3208         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3209 
3210         // If we're not evicting and we're migrating away from the preferred
3211         // location, then we should add the range group range to the list of
3212         // migrated ranges in the range group. It's safe to skip this during
3213         // eviction because the use of range_group's migrated_ranges list is a
3214         // UVM-Lite optimization - eviction is not supported on UVM-Lite GPUs.
3215         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3216             uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3217             // rgr_has_changed is used to minimize the number of times the
3218             // migrated_ranges_lock is taken. It is set to false when the range
3219             // group range pointed by rgr is added to the migrated_ranges list,
3220             // and it is just set back to true when we move to a different
3221             // range group range.
3222 
3223             // The current page could be after the end of rgr. Iterate over the
3224             // range group ranges until rgr's end location is greater than or
3225             // equal to the current page.
3226             while (rgr && rgr->node.end < page_start) {
3227                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3228                 rgr_has_changed = true;
3229             }
3230 
3231             // Check whether the current page lies within rgr. A single page
3232             // must entirely reside within a range group range. Since we've
3233             // incremented rgr until its end is higher than page_start, we now
3234             // check if page_start lies within rgr.
3235             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3236                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3237                 if (list_empty(&rgr->range_group_migrated_list_node))
3238                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3239                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3240 
3241                 rgr_has_changed = false;
3242             }
3243         }
3244 
3245         // No need to copy pages that haven't changed.  Just clear residency
3246         // information.
3247         if (block_page_is_clean(block, dst_id, src_id, page_index))
3248             continue;
3249 
3250         if (!copying_gpu) {
3251             status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3252 
3253             if (status != NV_OK)
3254                 break;
3255             copying_gpu = uvm_push_get_gpu(&push);
3256 
3257             // Record all processors involved in the copy
3258             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3259             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3260             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3261 
3262             // This function is called just once per VA block and needs to
3263             // receive the "main" cause for the migration (it mainly checks if
3264             // we are in the eviction path). Therefore, we pass cause instead
3265             // of contig_cause.
3266             uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3267         }
3268         else {
3269             uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3270         }
3271 
3272         if (!uvm_va_block_is_hmm(block))
3273             block_update_page_dirty_state(block, dst_id, src_id, page_index);
3274 
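        // last_index == region.outer means this is the first page being copied
        // in this region: start a new contiguous run and decide whether the
        // physical start addresses can be cached.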
3275         if (last_index == region.outer) {
3276             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3277             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3278             contig_start_index = page_index;
3279             contig_cause = page_cause;
3280 
3281             // When CC is enabled, transfers between GPU and CPU don't rely on
3282             // any GPU mapping of CPU chunks, physical or virtual.
3283             if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3284                 can_cache_src_phys_addr = false;
3285             if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3286                 can_cache_dst_phys_addr = false;
3287             // Computing the physical address is a non-trivial operation and
3288             // seems to be a performance limiter on systems with 2 or more
3289             // NVLINK links. Therefore, for physically-contiguous block
3290             // storage, we cache the start address and compute the page address
3291             // using the page index.
3292             if (can_cache_src_phys_addr) {
3293                 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3294                                                                           block_phys_page(src_id, 0),
3295                                                                           copying_gpu);
3296             }
3297             if (can_cache_dst_phys_addr) {
3298                 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3299                                                                           block_phys_page(dst_id, 0),
3300                                                                           copying_gpu);
3301             }
3302         }
3303         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3304             uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3305             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3306 
3307             // If both src and dst are physically-contiguous, consolidate copies
3308             // of contiguous pages into a single method.
3309             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3310                 block_copy_push(block, &copy_state, contig_region, &push);
3311 
3312             uvm_perf_event_notify_migration(&va_space->perf_events,
3313                                             &push,
3314                                             block,
3315                                             dst_id,
3316                                             src_id,
3317                                             uvm_va_block_region_start(block, contig_region),
3318                                             uvm_va_block_region_size(contig_region),
3319                                             transfer_mode,
3320                                             contig_cause,
3321                                             &block_context->make_resident);
3322 
3323             contig_start_index = page_index;
3324             contig_cause = page_cause;
3325         }
3326 
3327         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3328             block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3329 
3330         last_index = page_index;
3331     }
3332 
3333     // Copy the remaining pages
3334     if (copying_gpu) {
3335         uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3336         UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3337 
3338         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3339             block_copy_push(block, &copy_state, contig_region, &push);
3340 
3341         uvm_perf_event_notify_migration(&va_space->perf_events,
3342                                         &push,
3343                                         block,
3344                                         dst_id,
3345                                         src_id,
3346                                         uvm_va_block_region_start(block, contig_region),
3347                                         uvm_va_block_region_size(contig_region),
3348                                         transfer_mode,
3349                                         contig_cause,
3350                                         &block_context->make_resident);
3351 
3352         status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3353     }
3354 
3355     // Update VA block status bits
3356     //
3357     // Only update the bits for the pages that succeeded
3358     if (status != NV_OK)
3359         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3360 
3361     *copied_pages = uvm_page_mask_weight(copy_mask);
3362     if (*copied_pages)
3363         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3364 
3365     return status;
3366 }
3367 
3368 // Copy resident pages to the destination from all source processors in the
3369 // src_processor_mask
3370 //
3371 // The function adds the pages that were successfully copied to the output
3372 // migrated_pages mask and returns the number of pages in copied_pages. These
3373 // fields are reliable even if an error is returned.
3374 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
3375                                                 uvm_va_block_context_t *block_context,
3376                                                 uvm_processor_id_t dst_id,
3377                                                 const uvm_processor_mask_t *src_processor_mask,
3378                                                 uvm_va_block_region_t region,
3379                                                 const uvm_page_mask_t *page_mask,
3380                                                 const uvm_page_mask_t *prefetch_page_mask,
3381                                                 uvm_va_block_transfer_mode_t transfer_mode,
3382                                                 NvU32 max_pages_to_copy,
3383                                                 uvm_page_mask_t *migrated_pages,
3384                                                 NvU32 *copied_pages_out,
3385                                                 uvm_tracker_t *tracker_out)
3386 {
3387     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3388     uvm_processor_id_t src_id;
3389     uvm_processor_mask_t search_mask;
3390     uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
3391 
3392     uvm_processor_mask_copy(&search_mask, src_processor_mask);
3393 
3394     *copied_pages_out = 0;
3395 
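    // Visit candidate source processors in order of proximity to dst_id so
    // that pages resident in multiple places are copied from the closest one.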
3396     for_each_closest_id(src_id, &search_mask, dst_id, va_space) {
3397         uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id);
3398         NV_STATUS status;
3399         NvU32 copied_pages_from_src;
3400 
3401         UVM_ASSERT(!uvm_id_equal(src_id, dst_id));
3402 
3403         uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
3404 
3405         if (page_mask)
3406             uvm_page_mask_and(copy_mask, copy_mask, page_mask);
3407 
3408         status = block_copy_resident_pages_between(block,
3409                                                    block_context,
3410                                                    dst_id,
3411                                                    src_id,
3412                                                    region,
3413                                                    copy_mask,
3414                                                    prefetch_page_mask,
3415                                                    transfer_mode,
3416                                                    migrated_pages,
3417                                                    &copied_pages_from_src,
3418                                                    tracker_out);
3419         *copied_pages_out += copied_pages_from_src;
3420         UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
3421 
3422         if (status != NV_OK)
3423             return status;
3424 
3425         // Break out once we have copied the maximum number of pages
3426         if (*copied_pages_out == max_pages_to_copy)
3427             break;
3428     }
3429 
3430     return NV_OK;
3431 }
3432 
3433 static void break_read_duplication_in_region(uvm_va_block_t *block,
3434                                              uvm_va_block_context_t *block_context,
3435                                              uvm_processor_id_t dst_id,
3436                                              uvm_va_block_region_t region,
3437                                              const uvm_page_mask_t *page_mask)
3438 {
3439     uvm_processor_id_t id;
3440     uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask;
3441 
3442     uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
3443 
3444     UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id)));
3445 
3446     // Clear read_duplicated bit for all pages in region
3447     uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
3448 
3449     // Clear residency bits for all processors other than dst_id
3450     for_each_id_in_mask(id, &block->resident) {
3451         uvm_page_mask_t *other_resident_mask;
3452 
3453         if (uvm_id_equal(id, dst_id))
3454             continue;
3455 
3456         other_resident_mask = uvm_va_block_resident_mask_get(block, id);
3457 
3458         if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region))
3459             block_clear_resident_processor(block, id);
3460     }
3461 }
3462 
3463 static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
3464                                                  uvm_va_block_context_t *block_context,
3465                                                  uvm_processor_id_t dst_id,
3466                                                  uvm_va_block_region_t region,
3467                                                  const uvm_page_mask_t *page_mask)
3468 {
3469     uvm_page_index_t page_index;
3470     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3471     uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
3472 
3473     if (page_mask)
3474         uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask);
3475     else
3476         uvm_page_mask_complement(first_touch_mask, resident_mask);
3477 
3478     uvm_page_mask_region_clear_outside(first_touch_mask, region);
3479 
3480     for_each_va_block_page_in_mask(page_index, first_touch_mask, block) {
3481         UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index));
3482         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3483         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3484     }
3485 
3486     uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
3487     if (!uvm_page_mask_empty(resident_mask))
3488         block_set_resident_processor(block, dst_id);
3489 
3490     // Add them to the output mask, too
3491     uvm_page_mask_or(&block_context->make_resident.pages_changed_residency,
3492                      &block_context->make_resident.pages_changed_residency,
3493                      first_touch_mask);
3494 }
3495 
3496 // Copy resident pages from other processors to the destination.
3497 // All the pages on the destination need to be populated by the caller first.
3498 // Pages not resident anywhere else need to be zeroed out as well.
3499 // The transfer_mode is only used to tell uvm_perf_event_notify_migration()
3500 // whether the copy is for a migration or read duplication.
3501 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
3502                                            uvm_va_block_context_t *block_context,
3503                                            uvm_processor_id_t dst_id,
3504                                            uvm_va_block_region_t region,
3505                                            const uvm_page_mask_t *page_mask,
3506                                            const uvm_page_mask_t *prefetch_page_mask,
3507                                            uvm_va_block_transfer_mode_t transfer_mode)
3508 {
3509     NV_STATUS status = NV_OK;
3510     NV_STATUS tracker_status;
3511     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3512     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3513     NvU32 missing_pages_count;
3514     NvU32 pages_copied;
3515     NvU32 pages_copied_to_cpu;
3516     uvm_processor_mask_t src_processor_mask;
3517     uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
3518     uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
3519     uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged;
3520 
3521     uvm_page_mask_zero(migrated_pages);
3522     uvm_page_mask_zero(staged_pages);
3523 
3524     if (page_mask)
3525         uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
3526     else
3527         uvm_page_mask_complement(copy_page_mask, resident_mask);
3528 
3529     missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region);
3530 
3531     if (missing_pages_count == 0)
3532         goto out;
3533 
3534     // TODO: Bug 1753731: Add P2P2P copies staged through a GPU
3535     // TODO: Bug 1753731: When a page is resident in multiple locations due to
3536     //       read-duplication, spread out the source of the copy so we don't
3537     //       bottleneck on a single location.
3538 
3539     uvm_processor_mask_zero(&src_processor_mask);
3540 
3541     if (!uvm_id_equal(dst_id, UVM_ID_CPU)) {
3542         // If the destination is a GPU, first copy everything from processors
3543         // it can copy from directly. Notably this will copy pages from the CPU
3544         // as well, even if some extra copies from the CPU are later required
3545         // for staged copies.
3546         uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
3547         uvm_processor_mask_clear(&src_processor_mask, dst_id);
3548 
3549         status = block_copy_resident_pages_mask(block,
3550                                                 block_context,
3551                                                 dst_id,
3552                                                 &src_processor_mask,
3553                                                 region,
3554                                                 copy_page_mask,
3555                                                 prefetch_page_mask,
3556                                                 transfer_mode,
3557                                                 missing_pages_count,
3558                                                 migrated_pages,
3559                                                 &pages_copied,
3560                                                 &local_tracker);
3561 
3562         UVM_ASSERT(missing_pages_count >= pages_copied);
3563         missing_pages_count -= pages_copied;
3564 
3565         if (status != NV_OK)
3566             goto out;
3567 
3568         if (missing_pages_count == 0)
3569             goto out;
3570 
3571         if (pages_copied)
3572             uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
3573     }
3574 
3575     // Now copy from everywhere else to the CPU. This is both for when the
3576     // destination is the CPU (src_processor_mask empty) and for a staged copy
3577     // (src_processor_mask containing processors with copy access to dst_id).
3578     uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask);
3579     uvm_processor_mask_clear(&src_processor_mask, dst_id);
3580     uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU);
3581 
3582     status = block_copy_resident_pages_mask(block,
3583                                             block_context,
3584                                             UVM_ID_CPU,
3585                                             &src_processor_mask,
3586                                             region,
3587                                             copy_page_mask,
3588                                             prefetch_page_mask,
3589                                             transfer_mode,
3590                                             missing_pages_count,
3591                                             staged_pages,
3592                                             &pages_copied_to_cpu,
3593                                             &local_tracker);
3594     if (status != NV_OK)
3595         goto out;
3596 
3597     // If the destination is the CPU, then we already copied everything to it above
3598     if (UVM_ID_IS_CPU(dst_id)) {
3599         uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages);
3600         missing_pages_count -= pages_copied_to_cpu;
3601 
3602         goto out;
3603     }
3604 
3605     // Add everything to the block's tracker so that the
3606     // block_copy_resident_pages_between() call below will acquire it.
3607     status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3608     if (status != NV_OK)
3609         goto out;
3610     uvm_tracker_clear(&local_tracker);
3611 
3612     // Now copy staged pages from the CPU to the destination.
3613     status = block_copy_resident_pages_between(block,
3614                                                block_context,
3615                                                dst_id,
3616                                                UVM_ID_CPU,
3617                                                region,
3618                                                staged_pages,
3619                                                prefetch_page_mask,
3620                                                transfer_mode,
3621                                                migrated_pages,
3622                                                &pages_copied,
3623                                                &local_tracker);
3624 
3625     UVM_ASSERT(missing_pages_count >= pages_copied);
3626     missing_pages_count -= pages_copied;
3627 
3628     if (status != NV_OK)
3629         goto out;
3630 
3631     // If we get here, that means we were staging the copy through the CPU and
3632     // we should copy as many pages from the CPU as we copied to the CPU.
3633     UVM_ASSERT(pages_copied == pages_copied_to_cpu);
3634 
3635 out:
3636     // Add everything from the local tracker to the block's tracker.
3637     // Notably this is also needed for handling
3638     // block_copy_resident_pages_between() failures in the first loop.
3639     tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3640     uvm_tracker_deinit(&local_tracker);
3641 
3642     return status == NV_OK ? tracker_status : status;
3643 }
3644 
3645 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
3646                                           uvm_va_block_retry_t *va_block_retry,
3647                                           uvm_va_block_context_t *va_block_context,
3648                                           uvm_processor_id_t dest_id,
3649                                           uvm_va_block_region_t region,
3650                                           const uvm_page_mask_t *page_mask,
3651                                           const uvm_page_mask_t *prefetch_page_mask,
3652                                           uvm_make_resident_cause_t cause)
3653 {
3654     NV_STATUS status;
3655     uvm_processor_mask_t unmap_processor_mask;
3656     uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask;
3657     uvm_page_mask_t *resident_mask;
3658 
3659     va_block_context->make_resident.dest_id = dest_id;
3660     va_block_context->make_resident.cause = cause;
3661 
3662     if (prefetch_page_mask) {
3663         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3664                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3665                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3666     }
3667 
3668     uvm_assert_mutex_locked(&va_block->lock);
3669     UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
3670 
3671     resident_mask = block_resident_mask_get_alloc(va_block, dest_id);
3672     if (!resident_mask)
3673         return NV_ERR_NO_MEMORY;
3674 
3675     // Unmap all mapped processors except for UVM-Lite GPUs as their mappings
3676     // are largely persistent.
3677     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3678 
3679     if (page_mask)
3680         uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask);
3681     else
3682         uvm_page_mask_complement(unmap_page_mask, resident_mask);
3683     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3684 
3685     // Unmap all pages not resident on the destination
3686     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3687     if (status != NV_OK)
3688         return status;
3689 
3690     if (page_mask)
3691         uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages);
3692     else
3693         uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages);
3694     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3695 
3696     // Also unmap read-duplicated pages excluding dest_id
3697     uvm_processor_mask_clear(&unmap_processor_mask, dest_id);
3698     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3699     if (status != NV_OK)
3700         return status;
3701 
3702     uvm_tools_record_read_duplicate_invalidate(va_block,
3703                                                dest_id,
3704                                                region,
3705                                                unmap_page_mask);
3706 
3707     // Note that block_populate_pages and block_copy_resident_pages also use
3708     // va_block_context->make_resident.page_mask.
3709     unmap_page_mask = NULL;
3710 
3711     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3712     if (status != NV_OK)
3713         return status;
3714 
3715     return block_copy_resident_pages(va_block,
3716                                      va_block_context,
3717                                      dest_id,
3718                                      region,
3719                                      page_mask,
3720                                      prefetch_page_mask,
3721                                      UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
3722 }
3723 
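// Clears the evicted state of the pages in page_mask for dst_id. If no pages
// remain evicted on that GPU, it is also cleared from the block's evicted_gpus
// mask.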
3724 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block,
3725                                               uvm_processor_id_t dst_id,
3726                                               uvm_page_mask_t *page_mask)
3727 {
3728     uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id);
3729 
3730     UVM_ASSERT(dst_gpu_state);
3731 
3732     if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask))
3733         uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id);
3734 }
3735 
3736 static void block_make_resident_update_state(uvm_va_block_t *va_block,
3737                                              uvm_va_block_context_t *va_block_context,
3738                                              uvm_processor_id_t dst_id,
3739                                              uvm_va_block_region_t region,
3740                                              uvm_page_mask_t *copy_mask,
3741                                              uvm_make_resident_cause_t cause)
3742 {
3743     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id);
3744 
3745     uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
3746     block_set_resident_processor(va_block, dst_id);
3747 
3748     // Accumulate the pages that migrated into the output mask.
3749     uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency,
3750                      &va_block_context->make_resident.pages_changed_residency,
3751                      copy_mask);
3752 
3753     // Any move operation implies that mappings have been removed from all
3754     // non-UVM-Lite GPUs.
3755     uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
3756 
3757     // If we are migrating due to an eviction, mark the source GPUs as evicted
3758     // and record the evicted pages. Otherwise, if we are migrating back to a
3759     // GPU that has evicted pages, those pages are no longer evicted there.
3760     if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3761         uvm_processor_id_t src_id;
3762 
3763         UVM_ASSERT(UVM_ID_IS_CPU(dst_id));
3764 
3765         // Note that the destination is the CPU so this loop excludes it.
3766         for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) {
3767             uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id);
3768 
3769             UVM_ASSERT(src_gpu_state);
3770 
3771             uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask);
3772             uvm_processor_mask_set(&va_block->evicted_gpus, src_id);
3773         }
3774     }
3775     else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id))
3776         block_make_resident_clear_evicted(va_block, dst_id, copy_mask);
3777 }
3778 
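// Finish stage of uvm_va_block_make_resident(): marks the pages that actually
// migrated (and any first-touch populations) as resident on the destination,
// breaks read duplication and updates the eviction heuristics.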
3779 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
3780                                        uvm_va_block_context_t *va_block_context,
3781                                        uvm_va_block_region_t region,
3782                                        const uvm_page_mask_t *page_mask)
3783 {
3784     uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
3785     uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id;
3786 
3787     uvm_assert_mutex_locked(&va_block->lock);
3788 
3789     if (page_mask)
3790         uvm_page_mask_and(migrated_pages, migrated_pages, page_mask);
3791 
3792     if (!uvm_page_mask_empty(migrated_pages)) {
3793         // The migrated pages are now resident on the destination.
3794         block_make_resident_update_state(va_block,
3795                                          va_block_context,
3796                                          dst_id,
3797                                          region,
3798                                          migrated_pages,
3799                                          va_block_context->make_resident.cause);
3800     }
3801 
3802     // Pages that weren't resident anywhere else were populated at the
3803     // destination directly. Mark them as resident now.
3804     block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask);
3805 
3806     // Break read duplication and clear residency from other processors.
3807     break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask);
3808 
3809     // Update eviction heuristics, if needed. Notably this could repeat the call
3810     // done in block_set_resident_processor(), but that doesn't do anything bad
3811     // and it's simpler to keep it in both places.
3812     //
3813     // Skip this if we didn't do anything (the input region and/or page mask was
3814     // empty).
3815     if (uvm_processor_mask_test(&va_block->resident, dst_id))
3816         block_mark_memory_used(va_block, dst_id);
3817 }
3818 
3819 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
3820                                      uvm_va_block_retry_t *va_block_retry,
3821                                      uvm_va_block_context_t *va_block_context,
3822                                      uvm_processor_id_t dest_id,
3823                                      uvm_va_block_region_t region,
3824                                      const uvm_page_mask_t *page_mask,
3825                                      const uvm_page_mask_t *prefetch_page_mask,
3826                                      uvm_make_resident_cause_t cause)
3827 {
3828     NV_STATUS status;
3829 
3830     status = uvm_va_block_make_resident_copy(va_block,
3831                                              va_block_retry,
3832                                              va_block_context,
3833                                              dest_id,
3834                                              region,
3835                                              page_mask,
3836                                              prefetch_page_mask,
3837                                              cause);
3838     if (status != NV_OK)
3839         return status;
3840 
3841     uvm_va_block_make_resident_finish(va_block,
3842                                       va_block_context,
3843                                       region,
3844                                       page_mask);
3845 
3846     return NV_OK;
3847 }
3848 
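// Illustrative usage sketch (comment only, not driver code): callers typically
// drive uvm_va_block_make_resident() under the VA block lock with a retry
// helper so that vidmem allocations can be retried after evictions. The
// identifiers dest_id, region and mask below are placeholders, and the wrapper
// name assumes the UVM_VA_BLOCK_LOCK_RETRY() helper from uvm_va_block.h.
//
//     uvm_va_block_retry_t va_block_retry;
//     NV_STATUS status;
//
//     status = UVM_VA_BLOCK_LOCK_RETRY(va_block, &va_block_retry,
//                  uvm_va_block_make_resident(va_block,
//                                             &va_block_retry,
//                                             va_block_context,
//                                             dest_id,
//                                             region,
//                                             mask,
//                                             NULL,
//                                             UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE));
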
3849 // Combination function which prepares the input {region, page_mask} for
3850 // entering read-duplication. It:
3851 // - Unmaps all processors but revoke_id
3852 // - Revokes write access from revoke_id
3853 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block,
3854                                                    uvm_va_block_context_t *va_block_context,
3855                                                    uvm_processor_id_t revoke_id,
3856                                                    uvm_va_block_region_t region,
3857                                                    const uvm_page_mask_t *page_mask)
3858 {
3859     uvm_processor_mask_t unmap_processor_mask;
3860     uvm_processor_id_t unmap_id;
3861     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3862     NV_STATUS status, tracker_status;
3863 
3864     // Unmap everybody except revoke_id
3865     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3866     uvm_processor_mask_clear(&unmap_processor_mask, revoke_id);
3867 
3868     for_each_id_in_mask(unmap_id, &unmap_processor_mask) {
3869         status = uvm_va_block_unmap(va_block,
3870                                     va_block_context,
3871                                     unmap_id,
3872                                     region,
3873                                     page_mask,
3874                                     &local_tracker);
3875         if (status != NV_OK)
3876             goto out;
3877     }
3878 
3879     // Revoke WRITE/ATOMIC access permissions from the remaining mapped
3880     // processor.
3881     status = uvm_va_block_revoke_prot(va_block,
3882                                       va_block_context,
3883                                       revoke_id,
3884                                       region,
3885                                       page_mask,
3886                                       UVM_PROT_READ_WRITE,
3887                                       &local_tracker);
3888     if (status != NV_OK)
3889         goto out;
3890 
3891 out:
3892     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
3893     uvm_tracker_deinit(&local_tracker);
3894     return status == NV_OK ? tracker_status : status;
3895 }
3896 
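// Read-duplication variant of make_resident: pages are copied rather than
// moved to dest_id, so existing resident copies are preserved. Mappings on the
// other processors are first downgraded to read-only via
// block_prep_read_duplicate_mapping().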
3897 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
3898                                                     uvm_va_block_retry_t *va_block_retry,
3899                                                     uvm_va_block_context_t *va_block_context,
3900                                                     uvm_processor_id_t dest_id,
3901                                                     uvm_va_block_region_t region,
3902                                                     const uvm_page_mask_t *page_mask,
3903                                                     const uvm_page_mask_t *prefetch_page_mask,
3904                                                     uvm_make_resident_cause_t cause)
3905 {
3906     NV_STATUS status = NV_OK;
3907     uvm_processor_id_t src_id;
3908     uvm_page_mask_t *dst_resident_mask;
3909     uvm_page_mask_t *cpu_resident_mask;
3910     uvm_page_mask_t *migrated_pages;
3911     uvm_page_mask_t *staged_pages;
3912     uvm_page_mask_t *first_touch_mask;
3913 
3914     // TODO: Bug 3660922: need to implement HMM read duplication support.
3915     UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
3916 
3917     va_block_context->make_resident.dest_id = dest_id;
3918     va_block_context->make_resident.cause = cause;
3919 
3920     if (prefetch_page_mask) {
3921         // TODO: Bug 1877578: investigate automatic read-duplicate policies
3922         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3923                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3924                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3925     }
3926 
3927     uvm_assert_mutex_locked(&va_block->lock);
3928     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
3929 
3930     // For pages that are entering read-duplication we need to unmap remote
3931     // mappings and revoke RW and higher access permissions.
3932     //
3933     // The current implementation:
3934     // - Unmaps pages from all processors but the one with the resident copy
3935     // - Revokes write access from the processor with the resident copy
3936     for_each_id_in_mask(src_id, &va_block->resident) {
3937         // Note that the below calls to block_populate_pages and
3938         // block_copy_resident_pages also use
3939         // va_block_context->make_resident.page_mask.
3940         uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
3941         const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
3942         UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
3943 
3944         if (page_mask)
3945             uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages);
3946         else
3947             uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages);
3948 
3949         // If there are no pages that need to be unmapped/revoked, skip to the
3950         // next processor
3951         if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask))
3952             continue;
3953 
3954         status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
3955         if (status != NV_OK)
3956             return status;
3957     }
3958 
3959     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3960     if (status != NV_OK)
3961         return status;
3962 
3963     status = block_copy_resident_pages(va_block,
3964                                        va_block_context,
3965                                        dest_id,
3966                                        region,
3967                                        page_mask,
3968                                        prefetch_page_mask,
3969                                        UVM_VA_BLOCK_TRANSFER_MODE_COPY);
3970     if (status != NV_OK)
3971         return status;
3972 
3973     // Pages that weren't resident anywhere else were populated at the
3974     // destination directly. Mark them as resident now, since there were no
3975     // errors from block_copy_resident_pages() above.
3976     // Note that va_block_context->scratch_page_mask is passed to
3977     // block_copy_set_first_touch_residency(), which is generally unsafe. It is
3978     // safe here because block_copy_set_first_touch_residency() copies page_mask
3979     // before scratch_page_mask could be clobbered.
3980     migrated_pages = &va_block_context->make_resident.pages_migrated;
3981     first_touch_mask = &va_block_context->scratch_page_mask;
3982     uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
3983     uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
3984 
3985     if (!uvm_page_mask_empty(first_touch_mask))
3986         block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
3987 
3988     staged_pages = &va_block_context->make_resident.pages_staged;
3989     if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
3990         cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU);
3991         uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages);
3992         block_set_resident_processor(va_block, UVM_ID_CPU);
3993         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
3994         uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
3995     }
3996 
3997     if (!uvm_page_mask_empty(migrated_pages)) {
3998         dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id);
3999         uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
4000         block_set_resident_processor(va_block, dest_id);
4001         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
4002         uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
4003     }
4004 
4005     UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION);
4006     if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id))
4007         block_make_resident_clear_evicted(va_block, dest_id, migrated_pages);
4008 
4009     // Update eviction heuristics, if needed. Notably this could repeat the call
4010     // done in block_set_resident_processor(), but that doesn't do anything bad
4011     // and it's simpler to keep it in both places.
4012     //
4013     // Skip this if we didn't do anything (the input region and/or page mask was
4014     // empty).
4015     if (uvm_processor_mask_test(&va_block->resident, dest_id))
4016         block_mark_memory_used(va_block, dest_id);
4017 
4018     return NV_OK;
4019 }
4020 
4021 // Looks up the current CPU mapping state of page from the
4022 // block->cpu.pte_bits bitmaps. If write access is enabled,
4023 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since
4024 // write access implies atomic access for CPUs.
4025 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index)
4026 {
4027     uvm_prot_t prot;
4028 
4029     UVM_ASSERT(!uvm_va_block_is_dead(block));
4030 
4031     if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index))
4032         prot = UVM_PROT_READ_WRITE_ATOMIC;
4033     else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
4034         prot = UVM_PROT_READ_ONLY;
4035     else
4036         prot = UVM_PROT_NONE;
4037 
4038     return prot;
4039 }
4040 
4041 // Looks up the current GPU mapping state of page from the
4042 // block->gpus[i]->pte_bits bitmaps.
4043 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index)
4044 {
4045     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4046     uvm_prot_t prot;
4047 
4048     UVM_ASSERT(!uvm_va_block_is_dead(block));
4049 
4050     if (!gpu_state)
4051         return UVM_PROT_NONE;
4052 
4053     if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
4054         prot = UVM_PROT_READ_WRITE_ATOMIC;
4055     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
4056         prot = UVM_PROT_READ_WRITE;
4057     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
4058         prot = UVM_PROT_READ_ONLY;
4059     else
4060         prot = UVM_PROT_NONE;
4061 
4062     return prot;
4063 }
4064 
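// Looks up the current mapping protection of the page on the given processor,
// dispatching to the CPU or GPU lookup above.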
4065 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index)
4066 {
4067     if (UVM_ID_IS_CPU(id))
4068         return block_page_prot_cpu(block, page_index);
4069     else
4070         return block_page_prot_gpu(block, block_get_gpu(block, id), page_index);
4071 }
4072 
4073 // Returns true if the block has any valid CPU PTE mapping in the block region.
4074 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region)
4075 {
4076     size_t valid_page;
4077 
4078     UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block));
4079 
4080     // Early-out: check whether any address in this block has a CPU mapping
4081     if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
4082         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]));
4083         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4084         return false;
4085     }
4086 
4087     // All valid mappings have at least read permissions so we only need to
4088     // inspect the read bits.
4089     valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
4090     if (valid_page == region.outer)
4091         return false;
4092 
4093     UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE);
4094     return true;
4095 }
4096 
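// Sanity check that every GPU which accesses this chunk as an indirect peer
// has a reverse sysmem mapping registered which resolves back to this block,
// page index and chunk size.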
4097 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
4098 {
4099     uvm_gpu_t *accessing_gpu;
4100     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4101 
4102     if (!uvm_pmm_sysmem_mappings_indirect_supported())
4103         return true;
4104 
4105     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
4106         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
4107         uvm_reverse_map_t reverse_map;
4108         size_t num_mappings;
4109 
4110         num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
4111                                                            peer_addr,
4112                                                            uvm_gpu_chunk_get_size(chunk),
4113                                                            &reverse_map,
4114                                                            1);
4115         UVM_ASSERT(num_mappings == 1);
4116         UVM_ASSERT(reverse_map.va_block == block);
4117         UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
4118         UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
4119 
4120         uvm_va_block_release_no_destroy(reverse_map.va_block);
4121     }
4122 
4123     return true;
4124 }
4125 
4126 // Sanity check the given GPU's chunks array
4127 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
4128 {
4129     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
4130     uvm_gpu_t *gpu;
4131     size_t i, num_chunks;
4132     uvm_page_index_t page_index;
4133     uvm_chunk_size_t chunk_size;
4134 
4135     if (!gpu_state)
4136         return true;
4137 
4138     gpu = block_get_gpu(block, id);
4139 
4140     num_chunks = block_num_gpu_chunks(block, gpu);
4141     for (page_index = 0, i = 0; i < num_chunks; i++) {
4142         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
4143         size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
4144 
4145         if (chunk_index != i) {
4146             UVM_ERR_PRINT("chunk index mismatch: calculated %zu, stored at %zu. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u\n",
4147                            chunk_index,
4148                            i,
4149                            block->start,
4150                            block->end + 1,
4151                            uvm_id_value(id),
4152                            page_index);
4153             return false;
4154         }
4155 
4156         if (chunk) {
4157             if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
4158                 UVM_ERR_PRINT("chunk size mismatch: calculated %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
4159                               chunk_size,
4160                               uvm_gpu_chunk_get_size(chunk),
4161                               block->start,
4162                               block->end + 1,
4163                               uvm_id_value(id),
4164                               page_index,
4165                               i);
4166                 return false;
4167             }
4168 
4169             if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
4170                 UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
4171                               uvm_pmm_gpu_chunk_state_string(chunk->state),
4172                               block->start,
4173                               block->end + 1,
4174                               uvm_id_value(id),
4175                               page_index,
4176                               i,
4177                               chunk_size);
4178                 return false;
4179             }
4180 
4181             UVM_ASSERT(chunk->va_block == block);
4182             UVM_ASSERT(chunk->va_block_page_index == page_index);
4183 
4184             UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
4185         }
4186 
4187         page_index += chunk_size / PAGE_SIZE;
4188     }
4189 
4190     return true;
4191 }
4192 
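// Sanity check the chunk arrays of all GPUs as well as the CPU chunk storage.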
4193 static bool block_check_chunks(uvm_va_block_t *va_block)
4194 {
4195     uvm_gpu_id_t id;
4196 
4197     for_each_gpu_id(id) {
4198         if (!block_check_gpu_chunks(va_block, id))
4199             return false;
4200     }
4201 
4202     return block_check_cpu_chunks(va_block);
4203 }
4204 
4205 // Sanity checks for page mappings
4206 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index)
4207 {
4208     uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings;
4209     uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings;
4210     uvm_processor_mask_t remaining_mappings, temp_mappings;
4211     uvm_processor_mask_t resident_processors;
4212     const uvm_processor_mask_t *residency_accessible_from = NULL;
4213     const uvm_processor_mask_t *residency_has_native_atomics = NULL;
4214     uvm_processor_id_t residency, id;
4215     uvm_va_range_t *va_range = block->va_range;
4216     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4217     uvm_processor_id_t preferred_location = va_range ?
4218                                             uvm_va_range_get_policy(va_range)->preferred_location :
4219                                             UVM_ID_INVALID;
4220     const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
4221 
4222     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
4223     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
4224     block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings);
4225 
4226     // Each access bit implies all accesses below it
4227     UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings));
4228     UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings));
4229     UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped));
4230 
4231     uvm_va_block_page_resident_processors(block, page_index, &resident_processors);
4232     UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident));
4233 
4234     // Sanity check block_get_mapped_processors
4235     uvm_processor_mask_copy(&remaining_mappings, &read_mappings);
4236     for_each_id_in_mask(residency, &resident_processors) {
4237         block_get_mapped_processors(block, residency, page_index, &temp_mappings);
4238         UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings));
4239         uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings);
4240     }
4241 
4242     // Any remaining mappings point to non-resident locations, so they must be
4243     // UVM-Lite mappings.
4244     UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus));
4245 
4246     residency = uvm_processor_mask_find_first_id(&resident_processors);
4247 
4248     if (uvm_processor_mask_get_count(&resident_processors) > 0) {
4249         residency_accessible_from    = &va_space->accessible_from[uvm_id_value(residency)];
4250         residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)];
4251     }
4252 
4253     // If the page is not resident, there should be no valid mappings
4254     UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 ||
4255                    uvm_processor_mask_get_count(&read_mappings) == 0,
4256                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4257                    *resident_processors.bitmap,
4258                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4259                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4260                    *block->read_duplicated_pages.bitmap);
4261 
4262     // Test read_duplicated_pages mask
4263     UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 &&
4264                      !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
4265                    (uvm_processor_mask_get_count(&resident_processors) > 1 &&
4266                      uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
4267                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4268                    *resident_processors.bitmap,
4269                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4270                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4271                    *block->read_duplicated_pages.bitmap);
4272 
4273     if (!uvm_processor_mask_empty(uvm_lite_gpus))
4274         UVM_ASSERT(UVM_ID_IS_VALID(preferred_location));
4275 
4276     // UVM-Lite checks. Since the range group is made non-migratable before the
4277     // actual migrations for that range group happen, we can only make those
4278     // checks which are valid on both migratable and non-migratable range
4279     // groups.
4280     uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus);
4281     uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4282 
4283     // Any mapping from a UVM-Lite GPU must be atomic...
4284     UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings));
4285 
4286     // ... and must have access to preferred_location
4287     if (UVM_ID_IS_VALID(preferred_location)) {
4288         const uvm_processor_mask_t *preferred_location_accessible_from;
4289 
4290         preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)];
4291         UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from));
4292     }
4293 
4294     for_each_id_in_mask(id, &lite_atomic_mappings)
4295         UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location));
4296 
4297     // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests
4298     uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus);
4299     uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus);
4300     uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4301 
4302     // Pages set to zero in maybe_mapped_pages must not be mapped on any
4303     // non-UVM-Lite GPU
4304     if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
4305         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0,
4306                        "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
4307                        *resident_processors.bitmap,
4308                        *block->mapped.bitmap,
4309                        *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap);
4310     }
4311 
4312     // Atomic mappings from GPUs with system-wide atomics disabled are treated
4313     // as write mappings, so remove them from the atomic mappings mask.
4314     uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
4315 
4316     if (!uvm_processor_mask_empty(&read_mappings)) {
4317         // Read-duplicate: if a page is resident in multiple locations, it
4318         // must be resident locally on each mapped processor.
4319         if (uvm_processor_mask_get_count(&resident_processors) > 1) {
4320             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors),
4321                            "Read-duplicate copies from remote processors\n"
4322                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4323                            *resident_processors.bitmap,
4324                            *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4325                            *va_space->system_wide_atomics_enabled_processors.bitmap,
4326                            *block->read_duplicated_pages.bitmap);
4327         }
4328         else {
4329             // Processors with mappings must have access to the processor that
4330             // has the valid copy
4331             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from),
4332                            "Not all processors have access to %s\n"
4333                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - "
4334                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4335                            uvm_va_space_processor_name(va_space, residency),
4336                            *resident_processors.bitmap,
4337                            *read_mappings.bitmap,
4338                            *write_mappings.bitmap,
4339                            *atomic_mappings.bitmap,
4340                            *residency_accessible_from->bitmap,
4341                            *residency_has_native_atomics->bitmap,
4342                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4343             for_each_id_in_mask(id, &read_mappings) {
4344                 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
4345 
4346                 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
4347                     uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
4348                     uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
4349                     uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL);
4350 
4351                     // This function will assert if no mapping exists
4352                     (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
4353                 }
4354             }
4355         }
4356     }
4357 
4358     // If any processor has a writable mapping, there must only be one copy of
4359     // the page in the system
4360     if (!uvm_processor_mask_empty(&write_mappings)) {
4361         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1,
4362                        "Too many resident copies for pages with write_mappings\n"
4363                        "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4364                        *resident_processors.bitmap,
4365                        *read_mappings.bitmap,
4366                        *write_mappings.bitmap,
4367                        *atomic_mappings.bitmap,
4368                        *va_space->system_wide_atomics_enabled_processors.bitmap,
4369                        *block->read_duplicated_pages.bitmap);
4370     }
4371 
4372     if (!uvm_processor_mask_empty(&atomic_mappings)) {
4373         uvm_processor_mask_t native_atomics;
4374 
4375         uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics);
4376 
4377         if (uvm_processor_mask_empty(&native_atomics)) {
4378             // No other faultable processor should be able to write
4379             uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors);
4380 
4381             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1,
4382                            "Too many write mappings to %s from processors with non-native atomics\n"
4383                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - "
4384                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4385                            uvm_va_space_processor_name(va_space, residency),
4386                            *resident_processors.bitmap,
4387                            *read_mappings.bitmap,
4388                            *write_mappings.bitmap,
4389                            *atomic_mappings.bitmap,
4390                            *residency_accessible_from->bitmap,
4391                            *residency_has_native_atomics->bitmap,
4392                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4393 
4394             // Only one processor outside of the native group can have atomics enabled
4395             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1,
4396                            "Too many atomic mappings to %s from processors with non-native atomics\n"
4397                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - "
4398                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4399                            uvm_va_space_processor_name(va_space, residency),
4400                            *resident_processors.bitmap,
4401                            *read_mappings.bitmap,
4402                            *write_mappings.bitmap,
4403                            *atomic_mappings.bitmap,
4404                            *residency_accessible_from->bitmap,
4405                            *residency_has_native_atomics->bitmap,
4406                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4407         }
4408         else {
4409             uvm_processor_mask_t non_native_atomics;
4410 
4411             // One or more processors within the native group have atomics enabled.
4412             // All processors outside of that group may have write but not atomic
4413             // permissions.
4414             uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics);
4415 
4416             UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics),
4417                            "Atomic mappings to %s from both native and non-native processors\n"
4418                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - "
4419                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4420                            uvm_va_space_processor_name(va_space, residency),
4421                            *resident_processors.bitmap,
4422                            *read_mappings.bitmap,
4423                            *write_mappings.bitmap,
4424                            *atomic_mappings.bitmap,
4425                            *residency_accessible_from->bitmap,
4426                            *residency_has_native_atomics->bitmap,
4427                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4428         }
4429     }
4430 
4431     return true;
4432 }
4433 
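// Sanity check the GPU's PTE state (2M, big and 4k ranges) against the block's
// mapping and residency state.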
4434 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
4435 {
4436     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4437     uvm_va_block_gpu_state_t *resident_gpu_state;
4438     uvm_pte_bits_gpu_t pte_bit;
4439     uvm_processor_id_t resident_id;
4440     uvm_prot_t prot;
4441     NvU32 big_page_size;
4442     size_t num_big_pages, big_page_index;
4443     uvm_va_block_region_t big_region, chunk_region;
4444     uvm_gpu_chunk_t *chunk;
4445 
4446     if (!gpu_state->page_table_range_4k.table)
4447         UVM_ASSERT(!gpu_state->activated_4k);
4448 
4449     if (!gpu_state->page_table_range_big.table) {
4450         UVM_ASSERT(!gpu_state->initialized_big);
4451         UVM_ASSERT(!gpu_state->activated_big);
4452     }
4453 
4454     // It's only safe to check the PTE mappings if we have page tables. See
4455     // uvm_va_block_get_gpu_va_space.
4456     if (!block_gpu_has_page_tables(block, gpu)) {
4457         UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id));
4458         return true;
4459     }
4460 
4461     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4462     num_big_pages = uvm_va_block_num_big_pages(block, big_page_size);
4463 
4464     if (block_gpu_supports_2m(block, gpu)) {
4465         if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) {
4466             // 2M blocks require the 2M entry to be allocated for the lower
4467             // ranges to also be allocated.
4468             UVM_ASSERT(gpu_state->page_table_range_2m.table);
4469         }
4470         else if (gpu_state->page_table_range_2m.table) {
4471             // If the 2M entry is present but the lower ones aren't, the PTE
4472             // must be 2M.
4473             UVM_ASSERT(gpu_state->pte_is_2m);
4474         }
4475     }
4476     else {
4477         UVM_ASSERT(!gpu_state->page_table_range_2m.table);
4478         if (num_big_pages == 0)
4479             UVM_ASSERT(!gpu_state->page_table_range_big.table);
4480     }
4481 
4482     // If we have the big table and it's in use then it must have been
4483     // initialized, even if it doesn't currently contain active PTEs.
4484     if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) ||
4485         (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big))
4486         UVM_ASSERT(gpu_state->initialized_big);
4487 
4488     if (gpu_state->pte_is_2m) {
4489         UVM_ASSERT(block_gpu_supports_2m(block, gpu));
4490         UVM_ASSERT(gpu_state->page_table_range_2m.table);
4491         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
4492         UVM_ASSERT(!gpu_state->force_4k_ptes);
4493 
4494         // GPU architectures which support 2M pages only support 64K as the big
4495         // page size. All of the 2M code assumes that
4496         // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full,
4497         // bitmap_complement, etc).
4498         BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4499 
4500         prot = block_page_prot_gpu(block, gpu, 0);
4501 
4502         // All page permissions match
4503         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4504             if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4505                 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit]));
4506             else
4507                 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit]));
4508         }
4509 
4510         if (prot != UVM_PROT_NONE) {
4511             resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
4512 
4513             // block_check_resident_proximity verifies that no closer processor
4514             // has a resident page, so we don't need to check that all pages
4515             // have the same resident_id.
4516 
4517             // block_check_mappings_page verifies that all pages marked resident
4518             // are backed by populated memory.
4519 
4520             // The memory on the mapped processor should be fully resident and
4521             // physically contiguous.
4522             UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id)));
4523 
4524             if (UVM_ID_IS_GPU(resident_id)) {
4525                 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
4526                 UVM_ASSERT(resident_gpu_state);
4527                 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
4528             }
4529             else {
4530                 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block,
4531                                                                        uvm_va_block_region_from_block(block),
4532                                                                        NULL);
4533 
4534                 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
4535                 UVM_ASSERT(chunk);
4536                 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
4537             }
4538         }
4539     }
4540     else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
4541         UVM_ASSERT(gpu_state->page_table_range_big.table);
4542         UVM_ASSERT(!gpu_state->force_4k_ptes);
4543         UVM_ASSERT(num_big_pages > 0);
4544         UVM_ASSERT(gpu_state->initialized_big);
4545 
4546         for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) {
4547             big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4548 
4549             if (!test_bit(big_page_index, gpu_state->big_ptes)) {
4550                 // If there are valid mappings but this isn't a big PTE, the
4551                 // mapping must be using the 4k PTEs.
4552                 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region))
4553                     UVM_ASSERT(gpu_state->page_table_range_4k.table);
4554                 continue;
4555             }
4556 
4557             prot = block_page_prot_gpu(block, gpu, big_region.first);
4558 
4559             // All page permissions match
4560             for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4561                 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4562                     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region));
4563                 else
4564                     UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region));
4565             }
4566 
4567             if (prot != UVM_PROT_NONE) {
4568                 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4569 
4570                 // The memory on the mapped processor should be fully resident
4571                 // and physically contiguous. Exception: UVM-Lite GPUs always map
4572                 // the preferred location even if the memory is resident
4573                 // elsewhere. Skip the residency check but still verify contiguity.
4574                 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
4575                     UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id),
4576                                                          big_region));
4577                 }
4578 
4579                 if (UVM_ID_IS_CPU(resident_id)) {
4580                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first);
4581 
4582                     UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
4583                     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
4584                 }
4585                 else {
4586                     // Check GPU chunks
4587                     chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL);
4588                     chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
4589                     UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
4590                 }
4591             }
4592         }
4593     }
4594 
4595     return true;
4596 }
4597 
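// Sanity check the per-processor resident/mapped/evicted master masks, then
// the mappings of every page and the PTE state of every GPU with allocated
// state.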
4598 static bool block_check_mappings(uvm_va_block_t *block)
4599 {
4600     uvm_page_index_t page_index;
4601     uvm_processor_id_t id;
4602 
4603     // Verify the master masks, since block_check_mappings_page relies on them
4604     for_each_processor_id(id) {
4605         const uvm_page_mask_t *resident_mask, *map_mask;
4606 
4607         if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) {
4608             UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4609             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
4610             UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id));
4611             continue;
4612         }
4613 
4614         resident_mask = uvm_va_block_resident_mask_get(block, id);
4615         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
4616 
4617         map_mask = uvm_va_block_map_mask_get(block, id);
4618         UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask));
4619 
4620         if (UVM_ID_IS_GPU(id)) {
4621             const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id);
4622             UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask));
4623 
4624             // Pages cannot be resident if they are marked as evicted
4625             UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask));
4626 
4627             // Pages cannot be resident on a GPU with no memory
4628             if (!block_processor_has_memory(block, id))
4629                 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4630         }
4631     }
4632 
4633     // Check that every page has coherent mappings
4634     for_each_va_block_page(page_index, block)
4635         block_check_mappings_page(block, page_index);
4636 
4637     for_each_gpu_id(id) {
4638         if (uvm_va_block_gpu_state_get(block, id)) {
4639             uvm_gpu_t *gpu = block_get_gpu(block, id);
4640 
4641             // Check big and/or 2M PTE state
4642             block_check_mappings_ptes(block, gpu);
4643         }
4644     }
4645 
4646     return true;
4647 }
4648 
4649 // See the comments on uvm_va_block_unmap
4650 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages)
4651 {
4652     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4653     uvm_pte_bits_cpu_t pte_bit;
4654     bool unmapped_something = false;
4655     uvm_va_block_region_t subregion;
4656     NvU32 num_mapped_processors;
4657 
4658     // Early-out if nothing in the region is mapped or being unmapped.
4659     if (!block_has_valid_mapping_cpu(block, region) ||
4660         (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])))
4661         return;
4662 
4663     // We can't actually unmap HMM ranges from the CPU here.
4664     // Unmapping happens as part of migrate_vma_setup().
4665     if (uvm_va_block_is_hmm(block)) {
4666         UVM_ASSERT(!uvm_va_block_is_hmm(block));
4667         return;
4668     }
4669 
4670     num_mapped_processors = uvm_processor_mask_get_count(&block->mapped);
4671 
4672     // If we are unmapping a page which we are tracking due to CPU faults with
4673     // correct permissions, clear the info. This will cover both the unmap and
4674     // revoke cases (since we implement CPU revocation by unmap + map)
4675     if (block->cpu.fault_authorized.first_fault_stamp &&
4676         uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index))
4677         block->cpu.fault_authorized.first_fault_stamp = 0;
4678 
4679     for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) {
4680         if (!block_has_valid_mapping_cpu(block, subregion))
4681             continue;
4682 
4683         unmap_mapping_range(va_space->mapping,
4684                             uvm_va_block_region_start(block, subregion),
4685                             uvm_va_block_region_size(subregion), 1);
4686 
4687         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
4688             uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion);
4689 
4690         // If the CPU is the only processor with mappings we can safely mark
4691         // the pages as fully unmapped
4692         if (num_mapped_processors == 1)
4693             uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
4694 
4695         unmapped_something = true;
4696     }
4697 
4698     if (!unmapped_something)
4699         return;
4700 
4701     // Check whether the block has any more mappings
4702     if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) {
4703         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4704         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
4705     }
4706 
4707     UVM_ASSERT(block_check_mappings(block));
4708 }
4709 
4710 // Given a mask of mapped pages, returns true if any of the pages in the mask
4711 // are mapped remotely by the given GPU.
4712 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
4713                                          uvm_page_mask_t *scratch_page_mask,
4714                                          uvm_gpu_id_t gpu_id,
4715                                          const uvm_page_mask_t *mapped_pages)
4716 {
4717     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
4718 
4719     if (!gpu_state)
4720         return false;
4721 
4722     // The caller must ensure that all pages of the input mask are really mapped
4723     UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
4724 
4725     // UVM-Lite GPUs map the preferred location if it's accessible, regardless
4726     // of the resident location.
4727     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) {
4728         if (uvm_page_mask_empty(mapped_pages))
4729             return false;
4730 
4731         return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
4732     }
4733 
4734     // Remote pages are pages which are mapped but not resident locally
4735     return uvm_page_mask_andnot(scratch_page_mask, mapped_pages, &gpu_state->resident);
4736 }
4737 
4738 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If
4739 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4740 //
4741 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4742 // caller is responsible for ending the TLB batch with the appropriate membar.
4743 static void block_gpu_pte_clear_4k(uvm_va_block_t *block,
4744                                    uvm_gpu_t *gpu,
4745                                    const uvm_page_mask_t *clear_page_mask,
4746                                    NvU64 pte_clear_val,
4747                                    uvm_pte_batch_t *pte_batch,
4748                                    uvm_tlb_batch_t *tlb_batch)
4749 {
4750     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4751     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4752     uvm_gpu_phys_address_t pte_addr;
4753     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4754     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
4755     uvm_va_block_region_t subregion;
4756     size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4757 
4758     for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) {
4759         num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page;
4760 
4761         pte_addr = uvm_page_table_range_entry_address(tree,
4762                                                       &gpu_state->page_table_range_4k,
4763                                                       subregion.first * ptes_per_page);
4764 
4765         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes);
4766 
4767         if (tlb_batch) {
4768             uvm_tlb_batch_invalidate(tlb_batch,
4769                                      uvm_va_block_region_start(block, subregion),
4770                                      uvm_va_block_region_size(subregion),
4771                                      UVM_PAGE_SIZE_4K,
4772                                      UVM_MEMBAR_NONE);
4773         }
4774     }
4775 }
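
// Illustrative call pattern (a sketch, not a specific call site): the caller
// owns the PTE batch, and optionally a TLB batch when the cleared PTEs may be
// cached by the MMU:
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(tree, tlb_batch);
//     block_gpu_pte_clear_4k(block, gpu, clear_mask, 0, pte_batch, tlb_batch);
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
//
// The callers below pass a NULL tlb_batch when the 4k PTEs cannot be cached,
// for example while the covering 2M or big PTE is still active.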
4776 
4777 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id
4778 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use
4779 // block_gpu_pte_clear_4k instead.
4780 //
4781 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4782 //
4783 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4784 // caller is responsible for ending the TLB batch with the appropriate membar.
4785 static void block_gpu_pte_write_4k(uvm_va_block_t *block,
4786                                    uvm_gpu_t *gpu,
4787                                    uvm_processor_id_t resident_id,
4788                                    uvm_prot_t new_prot,
4789                                    const uvm_page_mask_t *write_page_mask,
4790                                    uvm_pte_batch_t *pte_batch,
4791                                    uvm_tlb_batch_t *tlb_batch)
4792 {
4793     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4794     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4795     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4796     const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4797     uvm_va_block_region_t contig_region = {0};
4798     uvm_gpu_phys_address_t contig_addr = {0};
4799     uvm_gpu_phys_address_t page_addr = {0};
4800     uvm_page_index_t page_index;
4801     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
4802 
4803     UVM_ASSERT(new_prot != UVM_PROT_NONE);
4804     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
4805 
4806     for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
4807         uvm_gpu_phys_address_t pte_addr;
4808         size_t i;
4809 
4810         // Assume that this mapping will be used to write to the page
4811         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
4812             block_mark_cpu_page_dirty(block, page_index);
4813 
4814         if (page_index >= contig_region.outer) {
4815             contig_region = block_phys_contig_region(block, page_index, resident_id);
4816             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
4817             page_addr = contig_addr;
4818         }
4819 
4820         page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
4821 
4822         pte_addr = uvm_page_table_range_entry_address(tree,
4823                                                       &gpu_state->page_table_range_4k,
4824                                                       page_index * ptes_per_page);
4825 
4826         // Handle PAGE_SIZE > GPU PTE size
4827         for (i = 0; i < ptes_per_page; i++) {
4828             NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
4829             uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
4830             page_addr.address += UVM_PAGE_SIZE_4K;
4831             pte_addr.address += pte_size;
4832         }
4833 
4834         if (tlb_batch) {
4835             NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index);
4836             uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE);
4837         }
4838     }
4839 }
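
// Note on the inner loop above: ptes_per_page is PAGE_SIZE / UVM_PAGE_SIZE_4K,
// so a kernel configured with 4K pages writes a single 4k PTE per block page,
// while (for example) a 64K PAGE_SIZE kernel writes 16 consecutive 4k PTEs per
// block page, advancing the physical address by UVM_PAGE_SIZE_4K each time.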
4840 
4841 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered.
4842 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It
4843 // only writes 4k PTEs, not big PTEs.
4844 //
4845 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the
4846 // mapping from the corresponding big page (0) and which ones should be written
4847 // using memory from resident_id and new_prot (1). Unlike the other pte_write
4848 // functions, new_prot may be UVM_PROT_NONE.
4849 //
4850 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID
4851 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE
4852 // in this case.
4853 //
4854 // new_pages_mask must not be NULL.
4855 //
4856 // No TLB invalidates are required since we've set up the lower PTEs to never be
4857 // cached by the GPU's MMU when covered by larger PTEs.
4858 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
4859                                              uvm_va_block_context_t *block_context,
4860                                              uvm_gpu_t *gpu,
4861                                              uvm_processor_id_t resident_id,
4862                                              uvm_prot_t new_prot,
4863                                              const unsigned long *big_ptes_covered,
4864                                              const uvm_page_mask_t *new_pages_mask,
4865                                              uvm_pte_batch_t *pte_batch)
4866 {
4867     uvm_va_block_region_t big_region;
4868     size_t big_page_index;
4869     uvm_processor_id_t curr_resident_id;
4870     uvm_prot_t curr_prot;
4871     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4872 
4873     if (UVM_ID_IS_INVALID(resident_id))
4874         UVM_ASSERT(new_prot == UVM_PROT_NONE);
4875 
4876     for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4877         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4878 
4879         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
4880 
4881         // The unmap path doesn't know the current residency ahead of time, so
4882         // we have to look it up.
4883         if (UVM_ID_IS_INVALID(resident_id)) {
4884             curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4885         }
4886         else {
4887             // Check that we aren't changing the aperture of the existing
4888             // mappings. It could be legal in some cases (switching from {RO, A}
4889             // to {RO, B} for example) but we'd need to issue TLB membars.
4890             if (curr_prot != UVM_PROT_NONE)
4891                 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id));
4892 
4893             curr_resident_id = resident_id;
4894         }
4895 
4896         // Pages in new_pages_mask under this big page get new_prot
4897         uvm_page_mask_zero(&block_context->scratch_page_mask);
4898         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4899         if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4900             if (new_prot == UVM_PROT_NONE) {
4901                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4902             }
4903             else {
4904                 block_gpu_pte_write_4k(block,
4905                                        gpu,
4906                                        curr_resident_id,
4907                                        new_prot,
4908                                        &block_context->scratch_page_mask,
4909                                        pte_batch,
4910                                        NULL);
4911             }
4912         }
4913 
4914         // All other pages under this big page inherit curr_prot
4915         uvm_page_mask_zero(&block_context->scratch_page_mask);
4916         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4917         if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4918             if (curr_prot == UVM_PROT_NONE) {
4919                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4920             }
4921             else {
4922                 block_gpu_pte_write_4k(block,
4923                                        gpu,
4924                                        curr_resident_id,
4925                                        curr_prot,
4926                                        &block_context->scratch_page_mask,
4927                                        pte_batch,
4928                                        NULL);
4929             }
4930         }
4931     }
4932 }
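
// A sketch of the intended effect, assuming a 64K big page size and a 4K
// system page size: when a single big PTE is being split and new_pages_mask
// covers only its first 4k page, that page is written with new_prot using
// memory from resident_id, and the remaining 15 4k PTEs under the big page are
// written with the big PTE's current permissions (or cleared if the big PTE
// was invalid).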
4933 
4934 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is
4935 // NULL, all big PTEs in the {block, gpu} are cleared.
4936 //
4937 // If tlb_batch is provided, the big PTEs written are added to the batch. The
4938 // caller is responsible for ending the TLB batch with the appropriate membar.
4939 static void block_gpu_pte_clear_big(uvm_va_block_t *block,
4940                                     uvm_gpu_t *gpu,
4941                                     const unsigned long *big_ptes_mask,
4942                                     NvU64 pte_clear_val,
4943                                     uvm_pte_batch_t *pte_batch,
4944                                     uvm_tlb_batch_t *tlb_batch)
4945 {
4946     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4947     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
4948     NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
4949     uvm_gpu_phys_address_t pte_addr;
4950     NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
4951     size_t big_page_index;
4952     DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4953 
4954     if (big_ptes_mask)
4955         bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4956     else
4957         bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size));
4958 
4959     for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4960         pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables,
4961                                                       &gpu_state->page_table_range_big,
4962                                                       big_page_index);
4963         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1);
4964 
4965         if (tlb_batch) {
4966             uvm_tlb_batch_invalidate(tlb_batch,
4967                                      uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
4968                                      big_page_size,
4969                                      big_page_size,
4970                                      UVM_MEMBAR_NONE);
4971         }
4972     }
4973 }
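
// Callers in this file use two different clear values with different meanings:
// writing 0 makes a big PTE invalid, so the GPU MMU falls through to the 4k
// PTEs beneath it, while writing tree->hal->unmapped_pte(big_page_size) makes
// it unmapped, which stops lookups from reaching the 4k PTEs. The split and
// merge paths below rely on that distinction.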
4974 
4975 // Writes the big PTEs in big_ptes_mask using memory from resident_id with
4976 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
4977 // block_gpu_pte_clear_big instead.
4978 //
4979 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL.
4980 //
4981 // If tlb_batch is provided, the big PTEs written are added to the batch. The
4982 // caller is responsible for ending the TLB batch with the appropriate membar.
4983 static void block_gpu_pte_write_big(uvm_va_block_t *block,
4984                                     uvm_gpu_t *gpu,
4985                                     uvm_processor_id_t resident_id,
4986                                     uvm_prot_t new_prot,
4987                                     const unsigned long *big_ptes_mask,
4988                                     uvm_pte_batch_t *pte_batch,
4989                                     uvm_tlb_batch_t *tlb_batch)
4990 {
4991     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4992     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
4993     uvm_page_tree_t *tree = &gpu_va_space->page_tables;
4994     NvU32 big_page_size = tree->big_page_size;
4995     NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
4996     size_t big_page_index;
4997     uvm_va_block_region_t contig_region = {0};
4998     uvm_gpu_phys_address_t contig_addr = {0};
4999     uvm_gpu_phys_address_t page_addr = {0};
5000     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5001 
5002     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5003     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5004     UVM_ASSERT(big_ptes_mask);
5005 
5006     if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5007         UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0);
5008 
5009         if (!gpu->parent->can_map_sysmem_with_large_pages)
5010             UVM_ASSERT(UVM_ID_IS_GPU(resident_id));
5011     }
5012 
5013     for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5014         NvU64 pte_val;
5015         uvm_gpu_phys_address_t pte_addr;
5016         uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5017 
5018         // Assume that this mapping will be used to write to the page
5019         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) {
5020             uvm_page_index_t page_index;
5021 
5022             for_each_va_block_page_in_region(page_index, big_region)
5023                 block_mark_cpu_page_dirty(block, page_index);
5024         }
5025 
5026         if (big_region.first >= contig_region.outer) {
5027             contig_region = block_phys_contig_region(block, big_region.first, resident_id);
5028             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
5029             page_addr = contig_addr;
5030         }
5031 
5032         page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
5033 
5034         pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index);
5035         pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5036         uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5037 
5038         if (tlb_batch) {
5039             uvm_tlb_batch_invalidate(tlb_batch,
5040                                      uvm_va_block_region_start(block, big_region),
5041                                      big_page_size,
5042                                      big_page_size,
5043                                      UVM_MEMBAR_NONE);
5044         }
5045     }
5046 }
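
// A minimal usage sketch (hypothetical mask, mirroring the call sites below):
//
//     DECLARE_BITMAP(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//
//     bitmap_zero(big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
//     __set_bit(0, big_ptes);
//     block_gpu_pte_write_big(block, gpu, resident_id, UVM_PROT_READ_ONLY,
//                             big_ptes, pte_batch, tlb_batch);
//
// As with the 4k variant, the caller owns the surrounding uvm_pte_batch_begin/
// end and uvm_tlb_batch_begin/end calls and the final TLB membar.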
5047 
5048 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in
5049 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and
5050 // tlb_batch in order to poison the now-unused 4k PTEs.
5051 //
5052 // The 4k PTEs are invalidated with the specified membar.
5053 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
5054                                             uvm_va_block_context_t *block_context,
5055                                             uvm_gpu_t *gpu,
5056                                             const unsigned long *big_ptes_to_merge,
5057                                             uvm_push_t *push,
5058                                             uvm_pte_batch_t *pte_batch,
5059                                             uvm_tlb_batch_t *tlb_batch,
5060                                             uvm_membar_t tlb_membar)
5061 {
5062     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5063     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5064     NvU32 big_page_size = tree->big_page_size;
5065     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
5066     size_t big_page_index;
5067     DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5068 
5069     UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5070     UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5071 
5072     // We can be called with the 4k PTEs in two cases:
5073     // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active.
5074     //
5075     // 2) 4k PTEs unallocated. In this case the GPU may or may not have invalid
5076     //    4k PTEs active under the big PTE, depending on whether neighboring
5077     //    blocks caused the page tables to be allocated.
5078     //
5079     // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has
5080     // them cached.
5081 
5082     // Each big PTE is currently invalid so the 4ks are active (or unallocated).
5083     // First make the big PTEs unmapped to disable future lookups of the 4ks
5084     // under it. We can't directly transition the entry from valid 4k PTEs to
5085     // valid big PTEs, because that could cause the GPU TLBs to cache the same
5086     // VA in different cache lines. That could cause memory ordering to not be
5087     // maintained.
5088     block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch);
5089 
5090     // Now invalidate the big PTEs we just wrote as well as all 4ks under them.
5091     // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only
5092     // need to invalidate the 4k PTEs without actually writing them.
5093     for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5094         uvm_tlb_batch_invalidate(tlb_batch,
5095                                  uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
5096                                  big_page_size,
5097                                  big_page_size | UVM_PAGE_SIZE_4K,
5098                                  UVM_MEMBAR_NONE);
5099     }
5100 
5101     // End the batches for the caller. We need to do this here in order to
5102     // poison the 4ks below.
5103     uvm_pte_batch_end(pte_batch);
5104     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5105 
5106     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5107     // PTEs with a pattern which will trigger fatal faults on access. We have to
5108     // do this after the TLB invalidate of the big PTEs, or the GPU might use
5109     // the new values.
5110     if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) {
5111         uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge);
5112         uvm_pte_batch_begin(push, pte_batch);
5113         block_gpu_pte_clear_4k(block,
5114                                gpu,
5115                                &block_context->scratch_page_mask,
5116                                tree->hal->poisoned_pte(),
5117                                pte_batch,
5118                                NULL);
5119         uvm_pte_batch_end(pte_batch);
5120     }
5121 }
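
// Note that this function performs only the first half of a merge: it leaves
// the merged entries as unmapped big PTEs and ends both batches. Callers such
// as block_gpu_map_big_and_4k later start new batches and rewrite those big
// PTEs with their final permissions, which avoids the GPU TLBs caching the
// same VA through both the old 4k entries and the new big entries.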
5122 
5123 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}.
5124 //
5125 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5126 // responsible for ending the TLB batch with the appropriate membar.
5127 static void block_gpu_pte_clear_2m(uvm_va_block_t *block,
5128                                    uvm_gpu_t *gpu,
5129                                    uvm_pte_batch_t *pte_batch,
5130                                    uvm_tlb_batch_t *tlb_batch)
5131 {
5132     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5133     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5134     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5135     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5136 
5137     // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE,
5138     // which would cause a problem when trying to make the entry invalid since
5139     // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire
5140     // 16 bytes.
5141     uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1);
5142 
5143     if (tlb_batch)
5144         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5145 }
5146 
5147 // Writes the 2M PTE for {block, gpu} using memory from resident_id with
5148 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
5149 // block_gpu_pte_clear_2m instead.
5150 //
5151 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5152 // responsible for ending the TLB batch with the appropriate membar.
5153 static void block_gpu_pte_write_2m(uvm_va_block_t *block,
5154                                    uvm_gpu_t *gpu,
5155                                    uvm_processor_id_t resident_id,
5156                                    uvm_prot_t new_prot,
5157                                    uvm_pte_batch_t *pte_batch,
5158                                    uvm_tlb_batch_t *tlb_batch)
5159 {
5160     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5161     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5162     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5163     uvm_gpu_phys_address_t page_addr;
5164     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5165     NvU64 pte_val;
5166     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5167 
5168     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5169     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5170 
5171     if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
5172         block_mark_cpu_page_dirty(block, 0);
5173 
5174     page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu);
5175     pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5176     uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5177 
5178     if (tlb_batch)
5179         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5180 }
5181 
5182 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu)
5183 {
5184     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5185 
5186     if (!block_gpu_supports_2m(block, gpu))
5187         return false;
5188 
5189     if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) ||
5190         (gpu_state->page_table_range_4k.table  && !gpu_state->activated_4k))
5191         return true;
5192 
5193     return false;
5194 }
5195 
5196 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or
5197 // activates a newly-allocated page table (big or 4k) while the other is already
5198 // active. The caller must have already written the new PTEs under the table
5199 // with the appropriate membar.
5200 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch)
5201 {
5202     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5203     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5204 
5205     if (!gpu_state->pte_is_2m)
5206         UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu));
5207 
5208     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5209 
5210     // We always need a membar to order PDE/PTE writes with the TLB invalidate.
5211     // write_pde will do a MEMBAR_SYS by default.
5212     if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID)
5213         uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
5214     uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push);
5215 
5216     gpu->parent->host_hal->wait_for_idle(push);
5217 
5218     // Invalidate just the PDE
5219     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5220 
5221     if (gpu_state->page_table_range_big.table)
5222         gpu_state->activated_big = true;
5223 
5224     if (gpu_state->page_table_range_4k.table)
5225         gpu_state->activated_4k = true;
5226 }
5227 
5228 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should
5229 // have written all lower PTEs as appropriate into the given pte_batch already.
5230 // This function ends the PTE batch, activates the 2M PDE, and does a TLB
5231 // invalidate.
5232 //
5233 // The caller does not need to do any TLB invalidates since none of the lower
5234 // PTEs could be cached.
5235 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block,
5236                                           uvm_gpu_t *gpu,
5237                                           uvm_push_t *push,
5238                                           uvm_pte_batch_t *pte_batch,
5239                                           uvm_tlb_batch_t *tlb_batch,
5240                                           uvm_membar_t tlb_membar)
5241 {
5242     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5243     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5244 
5245     // Step 1: Make the 2M entry invalid. We can't directly transition from a
5246     //         valid 2M PTE to valid lower PTEs, because that could cause the
5247     //         GPU TLBs to cache the same VA in different cache lines. That
5248     //         could cause memory ordering to not be maintained.
5249     //
5250     //         If the 2M PTE is already invalid, no TLB invalidate is needed.
5251 
5252     if (curr_prot == UVM_PROT_NONE) {
5253         // If we aren't downgrading, then we don't need a membar.
5254         UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE);
5255 
5256         // End the batch, which pushes a membar to ensure that the caller's PTE
5257         // writes below 2M are observed before the PDE write we're about to do.
5258         uvm_pte_batch_end(pte_batch);
5259     }
5260     else {
5261         // The 64k and 4k PTEs can't possibly be cached since the 2M entry is
5262         // not yet a PDE, so we just need to invalidate this single 2M entry.
5263         uvm_tlb_batch_begin(tree, tlb_batch);
5264         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5265 
5266         // Make sure the PTE writes are observed before the TLB invalidate
5267         uvm_pte_batch_end(pte_batch);
5268         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5269     }
5270 
5271     // Step 2: Switch the 2M entry from invalid to a PDE. This activates the
5272     //         smaller PTEs.
5273     uvm_tlb_batch_begin(tree, tlb_batch);
5274     block_gpu_write_pde(block, gpu, push, tlb_batch);
5275     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5276 }
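
// Putting the split sequence together (see block_gpu_split_2m and
// block_gpu_map_split_2m below): the caller first writes all of the new big
// and 4k PTEs into pte_batch while the 2M entry is still a PTE, which is safe
// without TLB invalidates because the MMU cannot fetch below an active 2M PTE.
// This function then invalidates the 2M PTE (step 1) and switches it to a PDE
// (step 2), at which point the lower PTEs become visible to the GPU.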
5277 
5278 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE.
5279 // Any lower PTEs are invalidated with the specified membar.
5280 static void block_gpu_pte_merge_2m(uvm_va_block_t *block,
5281                                    uvm_va_block_context_t *block_context,
5282                                    uvm_gpu_t *gpu,
5283                                    uvm_push_t *push,
5284                                    uvm_membar_t tlb_membar)
5285 {
5286     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5287     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5288     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5289     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5290     NvU32 tlb_inval_sizes;
5291 
5292     UVM_ASSERT(!gpu_state->pte_is_2m);
5293     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5294 
5295     // The 2M entry is currently a PDE, so first make it invalid. We can't
5296     // directly transition the entry from a valid PDE to a valid 2M PTE, because
5297     // that could cause the GPU TLBs to cache the same VA in different cache
5298     // lines. That could cause memory ordering to not be maintained.
5299     uvm_pte_batch_begin(push, pte_batch);
5300     block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL);
5301     uvm_pte_batch_end(pte_batch);
5302 
5303     // Now invalidate both the 2M entry we just wrote as well as all lower-level
5304     // entries which could be cached. Subsequent MMU fills will stop at the now-
5305     // invalid 2M entry, so we only need to invalidate the lower PTEs without
5306     // actually writing them.
5307     tlb_inval_sizes = UVM_PAGE_SIZE_2M;
5308     if (gpu_state->page_table_range_big.table)
5309         tlb_inval_sizes |= UVM_PAGE_SIZE_64K;
5310 
5311     // Strictly speaking, we only need to invalidate those 4k ranges which are
5312     // not covered by a big PTE. However, any such invalidate will require
5313     // enough 4k invalidates to force the TLB batching to invalidate everything
5314     // anyway, so just do the simpler thing.
5315     if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5316         tlb_inval_sizes |= UVM_PAGE_SIZE_4K;
5317 
5318     uvm_tlb_batch_begin(tree, tlb_batch);
5319     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE);
5320     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5321 
5322     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5323     // PTEs with a pattern which will trigger fatal faults on access. We have to
5324     // do this after the TLB invalidate of the 2M entry, or the GPU might use
5325     // the new values.
5326     if (UVM_IS_DEBUG()) {
5327         uvm_pte_batch_begin(push, pte_batch);
5328 
5329         if (gpu_state->page_table_range_big.table) {
5330             block_gpu_pte_clear_big(block,
5331                                     gpu,
5332                                     NULL,
5333                                     tree->hal->poisoned_pte(),
5334                                     pte_batch,
5335                                     NULL);
5336         }
5337 
5338         if (gpu_state->page_table_range_4k.table) {
5339             block_gpu_pte_clear_4k(block,
5340                                    gpu,
5341                                    NULL,
5342                                    tree->hal->poisoned_pte(),
5343                                    pte_batch,
5344                                    NULL);
5345         }
5346 
5347         uvm_pte_batch_end(pte_batch);
5348     }
5349 }
5350 
5351 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
5352 {
5353     // Permissions upgrades (MAP) don't need membars
5354     if (pte_op == BLOCK_PTE_OP_MAP)
5355         return UVM_MEMBAR_NONE;
5356 
5357     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5358     UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE);
5359 
5360     return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id));
5361 }
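
// Roughly speaking, downgrades of mappings that target the revoking GPU's own
// memory only need the accesses flushed within that GPU, while downgrades of
// mappings to sysmem or peer memory need a system-scoped membar;
// uvm_hal_downgrade_membar_type makes that choice based on whether resident_id
// matches gpu->id.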
5362 
5363 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot
5364 // permissions. If the 2M entry is currently a PDE, it is first merged into a
5365 // PTE.
5366 //
5367 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead.
5368 //
5369 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5370 // the TLB membar required.
5371 static void block_gpu_map_to_2m(uvm_va_block_t *block,
5372                                 uvm_va_block_context_t *block_context,
5373                                 uvm_gpu_t *gpu,
5374                                 uvm_processor_id_t resident_id,
5375                                 uvm_prot_t new_prot,
5376                                 uvm_push_t *push,
5377                                 block_pte_op_t pte_op)
5378 {
5379     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5380     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5381     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5382     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5383     uvm_membar_t tlb_membar;
5384 
5385     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5386 
5387     // If we have a mix of big and 4k PTEs, we have to first merge them to an
5388     // invalid 2M PTE.
5389     if (!gpu_state->pte_is_2m) {
5390         block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE);
5391 
5392         gpu_state->pte_is_2m = true;
5393         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5394     }
5395 
5396     // Write the new permissions
5397     uvm_pte_batch_begin(push, pte_batch);
5398     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5399 
5400     block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch);
5401 
5402     uvm_pte_batch_end(pte_batch);
5403 
5404     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5405     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5406 }
5407 
5408 // Combination split + map operation, called when only part of a 2M PTE mapping
5409 // is being changed. This splits an existing valid or invalid 2M PTE into the
5410 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state.
5411 //
5412 // The PTEs covering the pages in pages_to_write are written to the memory on
5413 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5414 //
5415 // The PTEs covering the pages not set in pages_to_write inherit the mapping of
5416 // the current 2M PTE. If the current mapping is valid, it must target
5417 // resident_id.
5418 //
5419 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5420 // the TLB membar required.
5421 static void block_gpu_map_split_2m(uvm_va_block_t *block,
5422                                    uvm_va_block_context_t *block_context,
5423                                    uvm_gpu_t *gpu,
5424                                    uvm_processor_id_t resident_id,
5425                                    const uvm_page_mask_t *pages_to_write,
5426                                    uvm_prot_t new_prot,
5427                                    uvm_push_t *push,
5428                                    block_pte_op_t pte_op)
5429 {
5430     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5431     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5432     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5433     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5434     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5435     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5436     uvm_membar_t tlb_membar;
5437     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5438     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5439     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5440 
5441     UVM_ASSERT(gpu_state->pte_is_2m);
5442 
5443     if (!gpu_state->page_table_range_4k.table)
5444         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5445 
5446     uvm_pte_batch_begin(push, pte_batch);
5447 
5448     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5449     // from the lower levels. This means we don't need to issue a TLB invalidate
5450     // when writing those levels.
5451 
5452     // Cases to handle:
5453     // 1) Big PTEs which inherit curr_prot
5454     // 2) Big PTEs which get new_prot
5455     // 3) Big PTEs which are split to 4k
5456     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5457     //    b) 4k PTEs which get new_prot under the split big PTEs
5458 
5459     // Compute the big PTEs which will need to be split to 4k, if any.
5460     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5461 
5462     if (gpu_state->page_table_range_big.table) {
5463         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5464         // any. These are the big PTEs which are unchanged (uncovered) by the
5465         // operation.
5466         bitmap_andnot(big_ptes_inherit,
5467                       new_pte_state->big_ptes,
5468                       new_pte_state->big_ptes_covered,
5469                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5470 
5471         if (curr_prot == UVM_PROT_NONE) {
5472             block_gpu_pte_clear_big(block,
5473                                     gpu,
5474                                     big_ptes_inherit,
5475                                     tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
5476                                     pte_batch,
5477                                     NULL);
5478         }
5479         else {
5480             block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
5481         }
5482 
5483         // Case 2: Write the new big PTEs
5484         bitmap_and(big_ptes_new_prot,
5485                    new_pte_state->big_ptes,
5486                    new_pte_state->big_ptes_covered,
5487                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5488         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL);
5489 
5490         // Case 3: Write the big PTEs which cover 4k PTEs
5491         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5492 
5493         // We just wrote all possible big PTEs, so mark them as initialized
5494         gpu_state->initialized_big = true;
5495     }
5496     else {
5497         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5498     }
5499 
5500     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
5501     block_gpu_pte_big_split_write_4k(block,
5502                                      block_context,
5503                                      gpu,
5504                                      resident_id,
5505                                      new_prot,
5506                                      big_ptes_split,
5507                                      pages_to_write,
5508                                      pte_batch);
5509 
5510     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5511     // invalidate for the 2M entry.
5512     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5513     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
5514 
5515     gpu_state->pte_is_2m = false;
5516     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5517 }
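
// For example (a sketch, assuming a 64K big page size): upgrading only the
// first 4k page of a 2M PTE that currently maps resident_id read-only splits
// the first big PTE into 4k PTEs, where that one page gets new_prot and the
// other 4k PTEs under it inherit read-only (case 3), the remaining big PTEs
// inherit read-only (case 1), and the 2M entry is finally switched to a PDE.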
5518 
5519 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed.
5520 //
5521 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should
5522 // be 4k.
5523 static void block_gpu_split_2m(uvm_va_block_t *block,
5524                                uvm_va_block_context_t *block_context,
5525                                uvm_gpu_t *gpu,
5526                                const unsigned long *new_big_ptes,
5527                                uvm_push_t *push)
5528 {
5529     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5530     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5531     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5532     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5533     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5534     DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5535     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5536     NvU64 unmapped_pte_val;
5537     uvm_processor_id_t curr_residency;
5538 
5539     UVM_ASSERT(gpu_state->pte_is_2m);
5540 
5541     if (new_big_ptes)
5542         bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5543     else
5544         bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5545 
5546     if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5547         UVM_ASSERT(gpu_state->page_table_range_big.table);
5548 
5549     // We're splitting the 2M PTE, so we'll be writing all big PTEs (valid or cleared)
5550     if (gpu_state->page_table_range_big.table)
5551         gpu_state->initialized_big = true;
5552 
5553     // Cases to handle:
5554     // 1) Big PTEs which inherit curr_prot
5555     // 2) Big PTEs which are split to 4k
5556     //    a) 4k PTEs inherit curr_prot under the split big PTEs
5557 
5558     // big_ptes_split will cover the 4k regions
5559     bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5560     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split);
5561 
5562     uvm_pte_batch_begin(push, pte_batch);
5563 
5564     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5565     // from the lower levels. This means we don't need to issue a TLB invalidate
5566     // when writing those levels.
5567 
5568     if (curr_prot == UVM_PROT_NONE) {
5569         unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size);
5570 
5571         // Case 2a: Clear the 4k PTEs under big_ptes_split
5572         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5573 
5574         // Case 1: Make the remaining big PTEs unmapped
5575         block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL);
5576     }
5577     else {
5578         curr_residency = block_gpu_get_processor_to_map(block, gpu, 0);
5579 
5580         // Case 2a: Write the new 4k PTEs under big_ptes_split
5581         block_gpu_pte_write_4k(block,
5582                                gpu,
5583                                curr_residency,
5584                                curr_prot,
5585                                &block_context->scratch_page_mask,
5586                                pte_batch,
5587                                NULL);
5588 
5589         // Case 1: Write the new big PTEs
5590         block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL);
5591     }
5592 
5593     // Case 2: Make big_ptes_split invalid to activate the 4k PTEs
5594     if (gpu_state->page_table_range_big.table)
5595         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5596 
5597     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5598     // invalidate for the 2M entry. No membar is necessary since we aren't
5599     // changing permissions.
5600     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE);
5601 
5602     gpu_state->pte_is_2m = false;
5603     bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5604 }
5605 
5606 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are
5607 // changed.
5608 //
5609 // big_ptes_to_split must not be NULL.
5610 static void block_gpu_split_big(uvm_va_block_t *block,
5611                                 uvm_va_block_context_t *block_context,
5612                                 uvm_gpu_t *gpu,
5613                                 const unsigned long *big_ptes_to_split,
5614                                 uvm_push_t *push)
5615 {
5616     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5617     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5618     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5619     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5620     NvU32 big_page_size = tree->big_page_size;
5621     uvm_va_block_region_t big_region;
5622     uvm_processor_id_t resident_id;
5623     size_t big_page_index;
5624     uvm_prot_t curr_prot;
5625     DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5626 
5627     UVM_ASSERT(!gpu_state->pte_is_2m);
5628     UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5629     UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5630 
5631     uvm_pte_batch_begin(push, pte_batch);
5632     uvm_tlb_batch_begin(tree, tlb_batch);
5633 
5634     // Write all 4k PTEs under all big PTEs which are being split. We'll make
5635     // the big PTEs inactive below after flushing these writes. No TLB
5636     // invalidate is needed since the big PTE is active.
5637     bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5638     for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5639         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5640         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
5641 
5642         uvm_page_mask_zero(&block_context->scratch_page_mask);
5643         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
5644         if (curr_prot == UVM_PROT_NONE) {
5645             block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5646         }
5647         else {
5648             __set_bit(big_page_index, big_ptes_valid);
5649 
5650             resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
5651 
5652             block_gpu_pte_write_4k(block,
5653                                    gpu,
5654                                    resident_id,
5655                                    curr_prot,
5656                                    &block_context->scratch_page_mask,
5657                                    pte_batch,
5658                                    NULL);
5659         }
5660     }
5661 
5662     // Unmap the big PTEs which are valid and are being split to 4k. We can't
5663     // directly transition from a valid big PTE to valid lower PTEs, because
5664     // that could cause the GPU TLBs to cache the same VA in different cache
5665     // lines. That could cause memory ordering to not be maintained.
5666     block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
5667 
5668     // End the batches. We have to commit the membars and TLB invalidates
5669     // before we finish splitting formerly-big PTEs. No membar is necessary
5670     // since we aren't changing permissions.
5671     uvm_pte_batch_end(pte_batch);
5672     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5673 
5674     // Finish the split by switching the big PTEs from unmapped to invalid. This
5675     // causes the GPU MMU to start reading the 4k PTEs instead of stopping at
5676     // the unmapped big PTEs.
5677     uvm_pte_batch_begin(push, pte_batch);
5678     uvm_tlb_batch_begin(tree, tlb_batch);
5679 
5680     block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch);
5681 
5682     uvm_pte_batch_end(pte_batch);
5683 
5684     // Finally, activate the page tables if they're inactive
5685     if (block_gpu_needs_to_activate_table(block, gpu))
5686         block_gpu_write_pde(block, gpu, push, tlb_batch);
5687 
5688     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5689 
5690     bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5691 }
5692 
5693 // Changes permissions on some pre-existing mix of big and 4k PTEs into some
5694 // other mix of big and 4k PTEs, as described by
5695 // block_context->mapping.new_pte_state.
5696 //
5697 // The PTEs covering the pages in pages_to_write are written to the memory on
5698 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5699 //
5700 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5701 // the TLB membar required.
5702 static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
5703                                      uvm_va_block_context_t *block_context,
5704                                      uvm_gpu_t *gpu,
5705                                      uvm_processor_id_t resident_id,
5706                                      const uvm_page_mask_t *pages_to_write,
5707                                      uvm_prot_t new_prot,
5708                                      uvm_push_t *push,
5709                                      block_pte_op_t pte_op)
5710 {
5711     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5712     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5713     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5714     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5715     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5716     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5717     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5718     DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5719     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5720     uvm_va_block_region_t big_region;
5721     size_t big_page_index;
5722     NvU32 big_page_size = tree->big_page_size;
5723     uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5724 
5725     UVM_ASSERT(!gpu_state->pte_is_2m);
5726 
5727     uvm_pte_batch_begin(push, pte_batch);
5728     uvm_tlb_batch_begin(tree, tlb_batch);
5729 
5730     // All of these cases might be performed in the same call:
5731     // 1) Split currently-big PTEs to 4k
5732     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
5733     //    b) Write new 4k PTEs which get new_prot under the split big PTEs
5734     // 2) Merge currently-4k PTEs to big with new_prot
5735     // 3) Write currently-big PTEs which wholly get new_prot
5736     // 4) Write currently-4k PTEs which get new_prot
5737     // 5) Initialize big PTEs which are not covered by this operation
5738 
5739     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
5740     // being split. We'll make the big PTEs inactive below after flushing these
5741     // writes. No TLB invalidate is needed since the big PTE is active.
5742     //
5743     // Mask computation: big_before && !big_after
5744     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5745 
5746     block_gpu_pte_big_split_write_4k(block,
5747                                      block_context,
5748                                      gpu,
5749                                      resident_id,
5750                                      new_prot,
5751                                      big_ptes_split,
5752                                      pages_to_write,
5753                                      pte_batch);
5754 
5755     // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and
5756     // remain uncovered after the operation.
5757     //
5758     // Mask computation: !big_before && !big_after
5759     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5760     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
5761     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) {
5762         block_gpu_pte_write_4k(block,
5763                                gpu,
5764                                resident_id,
5765                                new_prot,
5766                                &block_context->scratch_page_mask,
5767                                pte_batch,
5768                                tlb_batch);
5769     }
5770 
5771     // Case 5: If the big page table is newly-allocated, make sure that all big
5772     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
5773     // all initialized to invalid.
5774     //
5775     // The similar case of making newly-allocated big PTEs unmapped when no
5776     // lower 4k table is present is handled by having
5777     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
5778     // appropriately.
5779     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
5780         // TODO: Bug 1766424: If we have the 4k page table already, we could
5781         //       attempt to merge all uncovered big PTE regions when first
5782         //       allocating the big table. That's probably not worth doing.
5783         UVM_ASSERT(gpu_state->page_table_range_4k.table);
5784         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5785         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
5786         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
5787         gpu_state->initialized_big = true;
5788     }
5789 
5790     // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are
5791     // being split to 4k. We can't directly transition from a valid big PTE to
5792     // valid lower PTEs, because that could cause the GPU TLBs to cache the same
5793     // VA in different cache lines. That could cause memory ordering to not be
5794     // maintained.
5795     bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5796     for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5797         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5798         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first))
5799             __set_bit(big_page_index, big_ptes_mask);
5800     }
5801 
5802     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
5803 
5804     // Case 3: Write the currently-big PTEs which remain big PTEs, and are
5805     // wholly changing permissions.
5806     //
5807     // Mask computation: big_before && big_after && covered
5808     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5809     if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5810         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch);
5811 
5812     // Case 2 (step 1): Merge the new big PTEs and end the batches, now that
5813     // we've done all of the independent PTE writes we can. This also merges
5814     // newly-allocated uncovered big PTEs to unmapped (see
5815     // block_gpu_compute_new_pte_state).
5816     //
5817     // Mask computation: !big_before && big_after
5818     if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5819         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
5820         // batches.
5821         block_gpu_pte_merge_big_and_end(block,
5822                                         block_context,
5823                                         gpu,
5824                                         big_ptes_merge,
5825                                         push,
5826                                         pte_batch,
5827                                         tlb_batch,
5828                                         tlb_membar);
5829 
5830         // Remove uncovered big PTEs. We needed to merge them to unmapped above,
5831         // but they shouldn't get new_prot below.
5832         bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5833     }
5834     else {
5835         // End the batches. We have to commit the membars and TLB invalidates
5836         // before we finish splitting formerly-big PTEs.
5837         uvm_pte_batch_end(pte_batch);
5838         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5839     }
5840 
5841     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5842         !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5843         block_gpu_needs_to_activate_table(block, gpu)) {
5844 
5845         uvm_pte_batch_begin(push, pte_batch);
5846         uvm_tlb_batch_begin(tree, tlb_batch);
5847 
5848         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
5849         // switching them from unmapped to invalid. This causes the GPU MMU to
5850         // start reading the 4k PTEs instead of stopping at the unmapped big
5851         // PTEs.
5852         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
5853 
5854         // Case 2 (step 2): Finish merging our big PTEs, if we have any, by
5855         // switching them from unmapped to new_prot.
5856         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch);
5857 
5858         uvm_pte_batch_end(pte_batch);
5859 
5860         // Finally, activate the page tables if they're inactive
5861         if (block_gpu_needs_to_activate_table(block, gpu))
5862             block_gpu_write_pde(block, gpu, push, tlb_batch);
5863 
5864         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5865     }
5866 
5867     // Update gpu_state
5868     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5869 }
5870 
5871 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is
5872 // merged into a PTE.
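// For example, block_unmap_gpu takes this path when the new PTE state
// collapses to a single 2M entry: either a whole valid 2M PTE is being
// unmapped, or all remaining mapped pages of a split 2M PTE are.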
5873 static void block_gpu_unmap_to_2m(uvm_va_block_t *block,
5874                                   uvm_va_block_context_t *block_context,
5875                                   uvm_gpu_t *gpu,
5876                                   uvm_push_t *push,
5877                                   uvm_membar_t tlb_membar)
5878 {
5879     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5880     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5881     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5882     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5883 
5884     if (gpu_state->pte_is_2m) {
5885         // If we're already mapped as a valid 2M PTE, just write it to invalid
5886         uvm_pte_batch_begin(push, pte_batch);
5887         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5888 
5889         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5890 
5891         uvm_pte_batch_end(pte_batch);
5892         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5893     }
5894     else {
5895         // Otherwise we have a mix of big and 4K PTEs which need to be merged
5896         // into an invalid 2M PTE.
5897         block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar);
5898 
5899         gpu_state->pte_is_2m = true;
5900         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5901     }
5902 }
5903 
5904 // Combination split + unmap operation, called when only part of a valid 2M PTE
5905 // mapping is being unmapped. The 2M PTE is split into a mix of valid and
5906 // invalid big and/or 4k PTEs, as described by
5907 // block_context->mapping.new_pte_state.
5908 //
5909 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
5910 //
5911 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of
5912 // the current 2M PTE.
5913 static void block_gpu_unmap_split_2m(uvm_va_block_t *block,
5914                                      uvm_va_block_context_t *block_context,
5915                                      uvm_gpu_t *gpu,
5916                                      const uvm_page_mask_t *pages_to_unmap,
5917                                      uvm_push_t *push,
5918                                      uvm_membar_t tlb_membar)
5919 {
5920     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5921     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5922     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5923     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5924     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5925     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5926     uvm_processor_id_t resident_id;
5927     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5928     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5929     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5930 
5931     UVM_ASSERT(gpu_state->pte_is_2m);
5932 
5933     resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
5934 
5935     uvm_pte_batch_begin(push, pte_batch);
5936 
5937     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5938     // from the lower levels. This means we don't need to issue a TLB invalidate
5939     // when writing those levels.
5940 
5941     // Cases to handle:
5942     // 1) Big PTEs which inherit curr_prot
5943     // 2) Big PTEs which get unmapped
5944     // 3) Big PTEs which are split to 4k
5945     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5946     //    b) 4k PTEs which get unmapped under the split big PTEs
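    //
    // For illustration (hypothetical masks): suppose pages_to_unmap fully
    // covers big page region 1, covers only part of region 2, and leaves all
    // other regions untouched. Then new_pte_state->big_ptes contains every
    // region except 2 and new_pte_state->big_ptes_covered = {1, 2}, so
    // region 0 (and regions 3+) inherits curr_prot as a big PTE (case 1),
    // region 1 becomes an unmapped big PTE (case 2), and region 2 is split to
    // a mix of inherited and unmapped 4k PTEs (cases 3a and 3b).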
5947 
5948     // Compute the big PTEs which will need to be split to 4k, if any.
5949     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5950 
5951     if (gpu_state->page_table_range_big.table) {
5952         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5953         // any. These are the big PTEs which are unchanged (uncovered) by the
5954         // operation.
5955         bitmap_andnot(big_ptes_inherit,
5956                       new_pte_state->big_ptes,
5957                       new_pte_state->big_ptes_covered,
5958                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5959 
5960         block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
5961 
        // Case 2: Clear to unmapped the new big PTEs which get wholly unmapped
        // (those which will be big PTEs with no 4k PTEs beneath them)
5964         bitmap_and(big_ptes_new_prot,
5965                    new_pte_state->big_ptes,
5966                    new_pte_state->big_ptes_covered,
5967                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5968 
5969         block_gpu_pte_clear_big(block,
5970                                 gpu,
5971                                 big_ptes_new_prot,
5972                                 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
5973                                 pte_batch,
5974                                 NULL);
5975 
        // Case 3: Write as invalid the big PTEs which are being split to 4k,
        // so the 4k PTEs beneath them are used once the PDE is activated below
5977         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5978 
5979         // We just wrote all possible big PTEs, so mark them as initialized
5980         gpu_state->initialized_big = true;
5981     }
5982     else {
5983         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5984         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5985     }
5986 
5987     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
5988     block_gpu_pte_big_split_write_4k(block,
5989                                      block_context,
5990                                      gpu,
5991                                      resident_id,
5992                                      UVM_PROT_NONE,
5993                                      big_ptes_split,
5994                                      pages_to_unmap,
5995                                      pte_batch);
5996 
5997     // And activate the 2M PDE. This ends the pte_batch and issues a single TLB
5998     // invalidate for the 2M entry.
5999     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6000 
6001     gpu_state->pte_is_2m = false;
6002     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6003 }
6004 
6005 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
6006 // and 4k PTEs.
6007 //
6008 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
6009 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
6010                                        uvm_va_block_context_t *block_context,
6011                                        uvm_gpu_t *gpu,
6012                                        const uvm_page_mask_t *pages_to_unmap,
6013                                        uvm_push_t *push,
6014                                        uvm_membar_t tlb_membar)
6015 {
6016     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6017     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6018     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6019     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6020     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6021     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6022     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6023     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6024     NvU32 big_page_size = tree->big_page_size;
6025     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6026 
6027     UVM_ASSERT(!gpu_state->pte_is_2m);
6028 
6029     uvm_pte_batch_begin(push, pte_batch);
6030     uvm_tlb_batch_begin(tree, tlb_batch);
6031 
    // All of these cases might be performed in the same call:
6033     // 1) Split currently-big PTEs to 4k
6034     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
6035     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
6036     // 2) Merge currently-4k PTEs to unmapped big
6037     // 3) Clear currently-big PTEs which wholly get unmapped
6038     // 4) Clear currently-4k PTEs which get unmapped
6039     // 5) Initialize big PTEs which are not covered by this operation
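    //
    // For illustration (hypothetical masks), suppose
    //     gpu_state->big_ptes             = {0, 1, 2}   (big before)
    //     new_pte_state->big_ptes         = {1, 2, 3}   (big after)
    //     new_pte_state->big_ptes_covered = {0, 2, 3}   (touched by the unmap)
    // Then region 0 is partially unmapped and split to 4k (case 1), region 3
    // has all of its 4k PTEs unmapped and merges into an unmapped big PTE
    // (case 2), region 2 is wholly unmapped and stays big (case 3), region 1
    // is untouched, and any pages being unmapped outside these regions have
    // their 4k PTEs cleared directly (case 4).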
6040 
6041     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
6042     // being split. We'll make the big PTEs inactive below after flushing these
6043     // writes. No TLB invalidate is needed since the big PTE is active.
6044     //
6045     // Mask computation: big_before && !big_after
6046     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6047 
6048     block_gpu_pte_big_split_write_4k(block,
6049                                      block_context,
6050                                      gpu,
6051                                      UVM_ID_INVALID,
6052                                      UVM_PROT_NONE,
6053                                      big_ptes_split,
6054                                      pages_to_unmap,
6055                                      pte_batch);
6056 
6057     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
6058     // remain uncovered after the unmap.
6059     //
6060     // Mask computation: !big_before && !big_after
6061     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6062     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
6063     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
6064         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
6065 
    // Case 5: If the big page table is newly-allocated, make sure that all big
    // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs)
    // are initialized to invalid.
6069     //
6070     // The similar case of making newly-allocated big PTEs unmapped when no
6071     // lower 4k table is present is handled by having
6072     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
6073     // appropriately.
6074     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
6075         // TODO: Bug 1766424: If we have the 4k page table already, we could
6076         //       attempt to merge all uncovered big PTE regions when first
6077         //       allocating the big table. That's probably not worth doing.
6078         UVM_ASSERT(gpu_state->page_table_range_4k.table);
6079         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6080         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
6081         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
6082         gpu_state->initialized_big = true;
6083     }
6084 
    // Case 3 and step 1 of case 1: Unmap both the currently-big PTEs which are
    // getting wholly unmapped and the currently-big PTEs which are being split
    // to 4k. We can't directly transition from a valid big PTE to valid lower
    // PTEs, because that could cause the GPU TLBs to cache the same VA in
    // different cache lines, which could break memory ordering.
6091     //
6092     // Mask computation: (big_before && big_after && covered) ||
6093     //                   (big_before && !big_after)
6094     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6095     bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6096     bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6097     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch);
6098 
6099     // Case 2: Merge the new big PTEs and end the batches, now that we've done
6100     // all of the independent PTE writes we can.
6101     //
6102     // Mask computation: !big_before && big_after
6103     if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
6104         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
6105         // batches.
6106         block_gpu_pte_merge_big_and_end(block,
6107                                         block_context,
6108                                         gpu,
6109                                         big_ptes_mask,
6110                                         push,
6111                                         pte_batch,
6112                                         tlb_batch,
6113                                         tlb_membar);
6114     }
6115     else {
6116         // End the batches. We have to commit the membars and TLB invalidates
6117         // before we finish splitting formerly-big PTEs.
6118         uvm_pte_batch_end(pte_batch);
6119         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6120     }
6121 
6122     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
6123         block_gpu_needs_to_activate_table(block, gpu)) {
6124         uvm_pte_batch_begin(push, pte_batch);
6125         uvm_tlb_batch_begin(tree, tlb_batch);
6126 
6127         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
6128         // switching them from unmapped to invalid. This causes the GPU MMU to
6129         // start reading the 4k PTEs instead of stopping at the unmapped big
6130         // PTEs.
6131         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
6132 
6133         uvm_pte_batch_end(pte_batch);
6134 
6135         // Finally, activate the page tables if they're inactive
6136         if (block_gpu_needs_to_activate_table(block, gpu))
6137             block_gpu_write_pde(block, gpu, push, tlb_batch);
6138 
6139         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
6140     }
6141 
6142     // Update gpu_state
6143     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6144 }
6145 
6146 // When PTE state is about to change (for example due to a map/unmap/revoke
6147 // operation), this function decides how to split and merge the PTEs in response
6148 // to that operation.
6149 //
6150 // The operation is described with the two page masks:
6151 //
6152 // - pages_changing indicates which pages will have their PTE mappings changed
6153 //   on the GPU in some way as a result of the operation (for example, which
6154 //   pages will actually have their mapping permissions upgraded).
6155 //
6156 // - page_mask_after indicates which pages on this GPU will have exactly the
6157 //   same PTE attributes (permissions, residency) as pages_changing after the
6158 //   operation is applied.
6159 //
6160 // PTEs are merged eagerly.
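//
// For example (hypothetical): when upgrading permissions on a handful of
// pages, pages_changing holds just those pages; if every page of the block
// ends up with identical permissions and residency, page_mask_after is full,
// and on GPUs which support 2M PTEs (with physically-contiguous resident
// memory) the result is a single 2M PTE.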
6161 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
6162                                             uvm_gpu_t *gpu,
6163                                             uvm_processor_id_t resident_id,
6164                                             const uvm_page_mask_t *pages_changing,
6165                                             const uvm_page_mask_t *page_mask_after,
6166                                             uvm_va_block_new_pte_state_t *new_pte_state)
6167 {
6168     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6169     uvm_va_block_region_t big_region_all, big_page_region, region;
6170     NvU32 big_page_size;
6171     uvm_page_index_t page_index;
6172     size_t big_page_index;
6173     DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6174     bool can_make_new_big_ptes;
6175 
6176     memset(new_pte_state, 0, sizeof(*new_pte_state));
6177     new_pte_state->needs_4k = true;
6178 
6179     // TODO: Bug 1676485: Force a specific page size for perf testing
6180 
6181     if (gpu_state->force_4k_ptes)
6182         return;
6183 
6184     // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(),
6185     // hmm_range_fault(), and make_device_exclusive_range() don't handle folios
6186     // yet. Also, it makes mremap() difficult since the new address may not
6187     // align with the GPU block size otherwise.
    // If PAGE_SIZE is 64K, the code following this check is OK since 64K big
    // pages are supported on all HMM-supported GPUs (Turing+).
6190     // TODO: Bug 3368756: add support for transparent huge pages (THP).
6191     if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K)
6192         return;
6193 
6194     UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after));
6195 
6196     // If all pages in the 2M mask have the same attributes after the
6197     // operation is applied, we can use a 2M PTE.
6198     if (block_gpu_supports_2m(block, gpu) &&
6199         uvm_page_mask_full(page_mask_after) &&
6200         (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) {
6201         new_pte_state->pte_is_2m = true;
6202         new_pte_state->needs_4k = false;
6203         return;
6204     }
6205 
6206     // Find big PTEs with matching attributes
6207 
6208     // Can this block fit any big pages?
6209     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
6210     big_region_all = uvm_va_block_big_page_region_all(block, big_page_size);
6211     if (big_region_all.first >= big_region_all.outer)
6212         return;
6213 
6214     new_pte_state->needs_4k = false;
6215 
6216     can_make_new_big_ptes = true;
6217 
6218     // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+).
6219     if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages)
6220         can_make_new_big_ptes = false;
6221 
    // Unmap (resident_id == UVM_ID_INVALID) is used on teardown paths which
    // must not fail and which never require splits. That means we should avoid
    // allocating PTEs which are only needed for merges.
6225     //
6226     // This only matters if we're merging to big PTEs. If we're merging to 2M,
6227     // then we must already have the 2M level (since it has to be allocated
6228     // before the lower levels).
6229     //
6230     // If pte_is_2m already and we don't have a big table, we're splitting so we
6231     // have to allocate.
6232     if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m)
6233         can_make_new_big_ptes = false;
6234 
6235     for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
6236         uvm_va_block_region_t contig_region = {0};
6237 
6238         big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
6239         big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6240 
6241         if (!UVM_ID_IS_INVALID(resident_id))
6242             contig_region = block_phys_contig_region(block, page_index, resident_id);
6243 
6244         __set_bit(big_page_index, new_pte_state->big_ptes_covered);
6245 
6246         // When mapping sysmem, we can use big pages only if we are mapping all
6247         // pages in the big page subregion and the CPU pages backing the
6248         // subregion are physically contiguous.
6249         if (can_make_new_big_ptes &&
6250             uvm_page_mask_region_full(page_mask_after, big_page_region) &&
6251             (!UVM_ID_IS_CPU(resident_id) ||
6252              (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) {
6253             __set_bit(big_page_index, new_pte_state->big_ptes);
6254         }
6255 
6256         if (!test_bit(big_page_index, new_pte_state->big_ptes))
6257             new_pte_state->needs_4k = true;
6258 
6259         // Skip to the end of the region
6260         page_index = big_page_region.outer - 1;
6261     }
6262 
6263     if (!new_pte_state->needs_4k) {
6264         // All big page regions in pages_changing will be big PTEs. Now check if
6265         // there are any unaligned pages outside of big_region_all which are
6266         // changing.
6267         region = uvm_va_block_region(0, big_region_all.first);
6268         if (!uvm_page_mask_region_empty(pages_changing, region)) {
6269             new_pte_state->needs_4k = true;
6270         }
6271         else {
6272             region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block));
6273             if (!uvm_page_mask_region_empty(pages_changing, region))
6274                 new_pte_state->needs_4k = true;
6275         }
6276     }
6277 
6278     // Now add in the PTEs which should be big but weren't covered by this
6279     // operation.
6280     //
6281     // Note that we can't assume that a given page table range has been
6282     // initialized if it's present here, since it could have been allocated by a
6283     // thread which had to restart its operation due to allocation retry.
6284     if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) {
6285         // We're splitting a 2M PTE so all of the uncovered big PTE regions will
6286         // become big PTEs which inherit the 2M permissions. If we haven't
6287         // allocated the 2M table yet, it will start as a 2M PTE until the lower
6288         // levels are allocated, so it's the same split case regardless of
6289         // whether this operation will need to retry a later allocation.
6290         bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6291     }
6292     else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) {
6293         // If we don't have 4k PTEs and we won't be allocating them for this
6294         // operation, all of our PTEs need to be big.
6295         UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6296         bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6297         bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size));
6298     }
6299     else {
6300         // Otherwise, add in all of the currently-big PTEs which are unchanging.
6301         // They won't be written, but they need to be carried into the new
6302         // gpu_state->big_ptes when it's updated.
6303         bitmap_andnot(big_ptes_not_covered,
6304                       gpu_state->big_ptes,
6305                       new_pte_state->big_ptes_covered,
6306                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6307     }
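
    // For illustration (hypothetical): in the final case above, if this
    // operation only touches big page region 5 while regions 0-7 are currently
    // big PTEs, big_ptes_not_covered picks up regions 0-4 and 6-7 from
    // gpu_state->big_ptes so they carry over unchanged into the new state.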
6308 
6309     bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6310 }
6311 
6312 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that
6313 // handles allocation retry. If the block lock has been unlocked and relocked as
6314 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal
6315 // to the caller that the operation likely needs to be restarted. If that
6316 // happens, the pending tracker is added to the block's tracker.
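//
// Callers are expected to propagate NV_ERR_MORE_PROCESSING_REQUIRED and
// restart the operation; the range allocated here stays in place for the
// retried operation to use (see block_alloc_ptes_with_retry, which keeps
// allocating the remaining levels after a retry is signaled for the same
// reason).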
6317 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
6318                                                  uvm_gpu_t *gpu,
6319                                                  NvU32 page_size,
6320                                                  uvm_page_table_range_t *page_table_range,
6321                                                  uvm_tracker_t *pending_tracker)
6322 {
6323     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6324     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6325     uvm_page_tree_t *page_tables = &gpu_va_space->page_tables;
6326     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
6327     uvm_page_table_range_t local_range;
6328     NV_STATUS status;
6329 
6330     // Blocks may contain large PTEs without starting on a PTE boundary or
6331     // having an aligned size. Cover the PTEs of this size in the block's
6332     // interior so we match uvm_va_block_gpu_state_t::big_ptes.
6333     NvU64 start = UVM_ALIGN_UP(va_block->start, page_size);
6334     NvU64 size  = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start;
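    // For example (hypothetical addresses): a block spanning
    // [0x10001000, 0x100FFFFF] with a 64K page_size yields start = 0x10010000
    // and size = 0xF0000, covering the 15 big pages which lie entirely within
    // the block.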
6335 
6336     // VA blocks which can use the 2MB level as either a PTE or a PDE need to
6337     // account for the PDE specially, so they must use uvm_page_tree_alloc_table
6338     // to allocate the lower levels.
6339     bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M;
6340 
6341     UVM_ASSERT(page_table_range->table == NULL);
6342 
6343     if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) {
6344         --va_block_test->page_table_allocation_retry_force_count;
6345         status = NV_ERR_NO_MEMORY;
6346     }
6347     else if (use_alloc_table) {
6348         // Pascal+: 4k/64k tables under a 2M entry
6349         UVM_ASSERT(gpu_state->page_table_range_2m.table);
6350         status = uvm_page_tree_alloc_table(page_tables,
6351                                            page_size,
6352                                            UVM_PMM_ALLOC_FLAGS_NONE,
6353                                            &gpu_state->page_table_range_2m,
6354                                            page_table_range);
6355     }
6356     else {
6357         // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+
6358         status = uvm_page_tree_get_ptes(page_tables,
6359                                         page_size,
6360                                         start,
6361                                         size,
6362                                         UVM_PMM_ALLOC_FLAGS_NONE,
6363                                         page_table_range);
6364     }
6365 
6366     if (status == NV_OK)
6367         goto allocated;
6368 
6369     if (status != NV_ERR_NO_MEMORY)
6370         return status;
6371 
6372     // Before unlocking the block lock, any pending work on the block has to be
6373     // added to the block's tracker.
6374     if (pending_tracker) {
6375         status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker);
6376         if (status != NV_OK)
6377             return status;
6378     }
6379 
6380     // Unlock the va block and retry with eviction enabled
6381     uvm_mutex_unlock(&va_block->lock);
6382 
6383     if (use_alloc_table) {
6384         // Although we don't hold the block lock here, it's safe to pass
6385         // gpu_state->page_table_range_2m to the page tree code because we know
6386         // that the 2m range has already been allocated, and that it can't go
6387         // away while we have the va_space lock held.
6388         status = uvm_page_tree_alloc_table(page_tables,
6389                                            page_size,
6390                                            UVM_PMM_ALLOC_FLAGS_EVICT,
6391                                            &gpu_state->page_table_range_2m,
6392                                            &local_range);
6393     }
6394     else {
6395         status = uvm_page_tree_get_ptes(page_tables,
6396                                         page_size,
6397                                         start,
6398                                         size,
6399                                         UVM_PMM_ALLOC_FLAGS_EVICT,
6400                                         &local_range);
6401     }
6402 
6403     uvm_mutex_lock(&va_block->lock);
6404 
6405     if (status != NV_OK)
6406         return status;
6407 
6408     status = NV_ERR_MORE_PROCESSING_REQUIRED;
6409 
6410     if (page_table_range->table) {
        // A different caller allocated the page tables in the meantime, so
        // release the local copy.
6413         uvm_page_tree_put_ptes(page_tables, &local_range);
6414         return status;
6415     }
6416 
6417     *page_table_range = local_range;
6418 
6419 allocated:
6420     // Mark the 2M PTE as active when we first allocate it, since we don't have
6421     // any PTEs below it yet.
6422     if (page_size == UVM_PAGE_SIZE_2M) {
6423         UVM_ASSERT(!gpu_state->pte_is_2m);
6424         gpu_state->pte_is_2m = true;
6425     }
6426     else if (page_size != UVM_PAGE_SIZE_4K) {
6427         // uvm_page_tree_get_ptes initializes big PTEs to invalid.
6428         // uvm_page_tree_alloc_table does not, so we'll have to do it later.
6429         if (use_alloc_table)
6430             UVM_ASSERT(!gpu_state->initialized_big);
6431         else
6432             gpu_state->initialized_big = true;
6433     }
6434 
6435     return status;
6436 }
6437 
6438 // Helper which allocates all page table ranges necessary for the given page
6439 // sizes. See block_alloc_pt_range_with_retry.
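// For example (hypothetical): page_sizes = UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K
// allocates both the big and 4k ranges; on GPUs where block_gpu_supports_2m()
// is true, the 2M level is added to the set and allocated first so the lower
// tables can be attached beneath it.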
6440 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
6441                                              uvm_gpu_t *gpu,
6442                                              NvU32 page_sizes,
6443                                              uvm_tracker_t *pending_tracker)
6444 {
6445     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6446     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6447     uvm_page_table_range_t *range;
6448     NvU32 page_size;
6449     NV_STATUS status, final_status = NV_OK;
6450 
6451     UVM_ASSERT(gpu_state);
6452 
6453     // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first
6454     // in order to allocate the levels below.
6455     if (block_gpu_supports_2m(va_block, gpu))
6456         page_sizes |= UVM_PAGE_SIZE_2M;
6457 
6458     UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes);
6459 
6460     for_each_chunk_size_rev(page_size, page_sizes) {
6461         if (page_size == UVM_PAGE_SIZE_2M)
6462             range = &gpu_state->page_table_range_2m;
6463         else if (page_size == UVM_PAGE_SIZE_4K)
6464             range = &gpu_state->page_table_range_4k;
6465         else
6466             range = &gpu_state->page_table_range_big;
6467 
6468         if (range->table)
6469             continue;
6470 
6471         if (page_size == UVM_PAGE_SIZE_2M) {
6472             UVM_ASSERT(!gpu_state->pte_is_2m);
6473             UVM_ASSERT(!gpu_state->page_table_range_big.table);
6474             UVM_ASSERT(!gpu_state->page_table_range_4k.table);
6475         }
6476         else if (page_size != UVM_PAGE_SIZE_4K) {
6477             UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0);
6478             UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6479         }
6480 
6481         status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker);
6482 
6483         // Keep going to allocate the remaining levels even if the allocation
6484         // requires a retry, since we'll likely still need them when we retry
6485         // anyway.
6486         if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
6487             final_status = NV_ERR_MORE_PROCESSING_REQUIRED;
6488         else if (status != NV_OK)
6489             return status;
6490     }
6491 
6492     return final_status;
6493 }
6494 
6495 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
6496                                             uvm_gpu_t *gpu,
6497                                             uvm_va_block_new_pte_state_t *new_pte_state,
6498                                             uvm_tracker_t *pending_tracker)
6499 {
6500     NvU32 page_sizes = 0;
6501 
6502     if (new_pte_state->pte_is_2m) {
6503         page_sizes |= UVM_PAGE_SIZE_2M;
6504     }
6505     else {
6506         if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6507             page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu);
6508 
6509         if (new_pte_state->needs_4k)
6510             page_sizes |= UVM_PAGE_SIZE_4K;
6511         else
6512             UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6513     }
6514 
6515     return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker);
6516 }
6517 
6518 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block.
6519 // This is currently used on ATS systems to prevent GPUs from inadvertently
6520 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables,
6521 // which is where the NOATS bit resides.
6522 //
6523 // The current implementation simply pre-allocates the PTEs for the VA Block,
6524 // which is wasteful because the GPU may never need them.
6525 //
6526 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1
6527 // page table entries without having to request PTEs.
6528 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
6529                                              uvm_gpu_va_space_t *gpu_va_space,
6530                                              uvm_tracker_t *pending_tracker)
6531 {
6532     NvU32 page_sizes;
6533     NvU32 big_page_size;
6534     uvm_gpu_t *gpu;
6535     uvm_va_block_gpu_state_t *gpu_state;
6536 
6537     UVM_ASSERT(block);
6538     UVM_ASSERT(gpu_va_space);
6539     UVM_ASSERT(gpu_va_space->ats.enabled);
6540     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
6541 
6542     gpu = gpu_va_space->gpu;
6543     big_page_size = gpu_va_space->page_tables.big_page_size;
6544 
6545     gpu_state = block_gpu_state_get_alloc(block, gpu);
6546     if (!gpu_state)
6547         return NV_ERR_NO_MEMORY;
6548 
6549     // If the VA Block supports 2M pages, allocate the 2M PTE only, as it
6550     // requires less memory
6551     if (block_gpu_supports_2m(block, gpu))
6552         page_sizes = UVM_PAGE_SIZE_2M;
6553     else if (uvm_va_block_num_big_pages(block, big_page_size) > 0)
6554         page_sizes = big_page_size;
6555     else
6556         page_sizes = UVM_PAGE_SIZE_4K;
6557 
6558     return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker);
6559 }
6560 
6561 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker)
6562 {
6563     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6564     NV_STATUS status = NV_OK;
6565 
6566     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
6567     // comments in block_pre_populate_pde1_gpu.
6568     if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) {
6569         uvm_gpu_va_space_t *gpu_va_space;
6570 
6571         for_each_gpu_va_space(gpu_va_space, va_space) {
6572             // We only care about systems where ATS is supported and the application
6573             // enabled it.
6574             if (!gpu_va_space->ats.enabled)
6575                 continue;
6576 
6577             status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker);
6578             if (status != NV_OK)
6579                 break;
6580         }
6581     }
6582 
6583     return status;
6584 }
6585 
6586 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
6587                                  uvm_va_block_context_t *block_context,
6588                                  uvm_gpu_t *gpu,
6589                                  const uvm_page_mask_t *unmap_page_mask,
6590                                  uvm_tracker_t *out_tracker)
6591 {
6592     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6593     uvm_pte_bits_gpu_t pte_bit;
6594     uvm_push_t push;
6595     uvm_membar_t tlb_membar;
6596     bool only_local_mappings;
6597     uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask;
6598     NV_STATUS status;
6599     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6600     bool mask_empty;
6601 
6602     // We have to check gpu_state before looking at any VA space state like our
6603     // gpu_va_space, because we could be on the eviction path where we don't
6604     // have a lock on that state. However, since remove_gpu_va_space walks each
6605     // block to unmap the GPU before destroying the gpu_va_space, we're
6606     // guaranteed that if this GPU has page tables, the gpu_va_space can't go
6607     // away while we're holding the block lock.
6608     if (!block_gpu_has_page_tables(block, gpu))
6609         return NV_OK;
6610 
6611     if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]))
6612         return NV_OK;
6613 
    // block_gpu_compute_new_pte_state needs a mask of pages which will have
    // matching attributes after the operation is performed. In the case of
    // unmap, those are the pages which will have no mapping afterwards: both
    // the pages being unmapped now and the pages which were already unmapped.
    uvm_page_mask_andnot(&block_context->scratch_page_mask,
                         &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ],
                         pages_to_unmap);
6618     uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask);
6619     block_gpu_compute_new_pte_state(block,
6620                                     gpu,
6621                                     UVM_ID_INVALID,
6622                                     pages_to_unmap,
6623                                     &block_context->scratch_page_mask,
6624                                     new_pte_state);
6625 
6626     status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker);
6627     if (status != NV_OK)
6628         return status;
6629 
    only_local_mappings = !block_has_remote_mapping_gpu(block,
                                                        &block_context->scratch_page_mask,
                                                        gpu->id,
                                                        pages_to_unmap);
6631     tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings);
6632 
6633     status = uvm_push_begin_acquire(gpu->channel_manager,
6634                                     UVM_CHANNEL_TYPE_MEMOPS,
6635                                     &block->tracker,
6636                                     &push,
6637                                     "Unmapping pages in block [0x%llx, 0x%llx)",
6638                                     block->start,
6639                                     block->end + 1);
6640     if (status != NV_OK)
6641         return status;
6642 
6643     if (new_pte_state->pte_is_2m) {
6644         // We're either unmapping a whole valid 2M PTE, or we're unmapping all
6645         // remaining pages in a split 2M PTE.
6646         block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar);
6647     }
6648     else if (gpu_state->pte_is_2m) {
6649         // The block is currently mapped as a valid 2M PTE and we're unmapping
6650         // some pages within the 2M, so we have to split it into the appropriate
6651         // mix of big and 4k PTEs.
6652         block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6653     }
6654     else {
6655         // We're unmapping some pre-existing mix of big and 4K PTEs into some
6656         // other mix of big and 4K PTEs.
6657         block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6658     }
6659 
6660     uvm_push_end(&push);
6661 
6662     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
6663         uvm_processor_mask_t non_uvm_lite_gpus;
6664         uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block));
6665 
6666         UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id));
6667 
6668         // If the GPU is the only non-UVM-Lite processor with mappings, we can
6669         // safely mark pages as fully unmapped
6670         if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1)
6671             uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
6672     }
6673 
6674     // Clear block PTE state
6675     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
6676         mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit],
6677                                            &gpu_state->pte_bits[pte_bit],
6678                                            pages_to_unmap);
6679         if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty)
6680             uvm_processor_mask_clear(&block->mapped, gpu->id);
6681     }
6682 
6683     UVM_ASSERT(block_check_mappings(block));
6684 
6685     return uvm_tracker_add_push_safe(out_tracker, &push);
6686 }
6687 
6688 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
6689                              uvm_va_block_context_t *va_block_context,
6690                              uvm_processor_id_t id,
6691                              uvm_va_block_region_t region,
6692                              const uvm_page_mask_t *unmap_page_mask,
6693                              uvm_tracker_t *out_tracker)
6694 {
6695     uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask;
6696 
6697     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
6698     uvm_assert_mutex_locked(&va_block->lock);
6699 
6700     if (UVM_ID_IS_CPU(id)) {
        block_unmap_cpu(va_block, region, unmap_page_mask);
        return NV_OK;
6703     }
6704 
6705     uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask);
6706 
6707     return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker);
6708 }
6709 
6710 // This function essentially works as a wrapper around vm_insert_page (hence
6711 // the similar function prototype). This is needed since vm_insert_page
6712 // doesn't take permissions as input, but uses vma->vm_page_prot instead.
6713 // Since we may have multiple VA blocks under one VMA which need to map
6714 // with different permissions, we have to manually change vma->vm_page_prot for
6715 // each call to vm_insert_page. Multiple faults under one VMA in separate
6716 // blocks can be serviced concurrently, so the VMA wrapper lock is used
6717 // to protect access to vma->vm_page_prot.
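//
// For example, if the VMA was created read/write but new_prot is
// UVM_PROT_READ_ONLY, target_flags below drops VM_WRITE and vm_get_page_prot()
// produces a read-only vm_page_prot, so the inserted CPU PTE is read-only even
// though the VMA itself allows writes.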
6718 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma,
6719                                      NvU64 addr,
6720                                      struct page *page,
6721                                      uvm_prot_t new_prot)
6722 {
6723     uvm_vma_wrapper_t *vma_wrapper;
6724     unsigned long target_flags;
6725     pgprot_t target_pgprot;
6726     int ret;
6727 
6728     UVM_ASSERT(vma);
6729     UVM_ASSERT(vma->vm_private_data);
6730 
6731     vma_wrapper = vma->vm_private_data;
6732     target_flags = vma->vm_flags;
6733 
6734     if (new_prot == UVM_PROT_READ_ONLY)
6735         target_flags &= ~VM_WRITE;
6736 
6737     target_pgprot = vm_get_page_prot(target_flags);
6738 
6739     // Take VMA wrapper lock to check vma->vm_page_prot
6740     uvm_down_read(&vma_wrapper->lock);
6741 
    // Take a write lock if we need to modify the VMA's vm_page_prot:
    // - vma->vm_page_prot creates writable PTEs but new_prot is RO
    // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
6745     if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) {
6746         uvm_up_read(&vma_wrapper->lock);
6747         uvm_down_write(&vma_wrapper->lock);
6748 
6749         vma->vm_page_prot = target_pgprot;
6750 
6751         uvm_downgrade_write(&vma_wrapper->lock);
6752     }
6753 
6754     ret = vm_insert_page(vma, addr, page);
6755     uvm_up_read(&vma_wrapper->lock);
6756     if (ret) {
6757         UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
6758         return errno_to_nv_status(ret);
6759     }
6760 
6761     return NV_OK;
6762 }
6763 
6764 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block,
6765                                        struct vm_area_struct *hmm_vma,
6766                                        uvm_page_index_t page_index)
6767 {
6768     uvm_prot_t logical_prot;
6769 
6770     if (uvm_va_block_is_hmm(va_block)) {
6771         NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index);
6772 
6773         logical_prot = uvm_hmm_compute_logical_prot(va_block, hmm_vma, addr);
6774     }
6775     else {
6776         uvm_va_range_t *va_range = va_block->va_range;
6777 
6778         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6779 
6780         // Zombified VA ranges no longer have a vma, so they have no permissions
6781         if (uvm_va_range_is_managed_zombie(va_range)) {
6782             logical_prot = UVM_PROT_NONE;
6783         }
6784         else {
6785             struct vm_area_struct *vma;
6786 
6787             vma = uvm_va_range_vma(va_range);
6788 
6789             if (!(vma->vm_flags & VM_READ))
6790                 logical_prot = UVM_PROT_NONE;
6791             else if (!(vma->vm_flags & VM_WRITE))
6792                 logical_prot = UVM_PROT_READ_ONLY;
6793             else
6794                 logical_prot = UVM_PROT_READ_WRITE_ATOMIC;
6795         }
6796     }
6797 
6798     return logical_prot;
6799 }
6800 
6801 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page)
6802 {
6803     struct page *page;
6804 
6805     if (UVM_ID_IS_CPU(block_page.processor)) {
6806         page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index);
6807     }
6808     else {
6809         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6810         uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor);
6811         size_t chunk_offset;
6812         uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
6813 
6814         UVM_ASSERT(gpu->mem_info.numa.enabled);
6815         page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE;
6816     }
6817 
6818     UVM_ASSERT(page);
6819     return page;
6820 }
6821 
6822 // Creates or upgrades a CPU mapping for the given page, updating the block's
6823 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page
6824 // will be mapped with at least new_prot permissions.
6825 //
6826 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use
6827 // block_unmap_cpu or uvm_va_block_revoke_prot instead.
6828 //
6829 // If the existing mapping is >= new_prot already, this is a no-op.
6830 //
6831 // It is the caller's responsibility to:
6832 //  - Revoke mappings from other processors as appropriate so the CPU can map
6833 //    with new_prot permissions
6834 //  - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference
6835 //    and mmap_lock is held in at least read mode)
6836 //  - For HMM blocks that vma is valid and safe to use, vma->vm_mm has a
6837 //    reference and mmap_lock is held in at least read mode
6838 //  - Ensure that the struct page corresponding to the physical memory being
6839 //    mapped exists
6840 //  - Manage the block's residency bitmap
6841 //  - Ensure that the block hasn't been killed (block->va_range is present)
6842 //  - Update the pte/mapping tracking state on success
6843 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
6844                                        struct vm_area_struct *hmm_vma,
6845                                        uvm_processor_id_t resident_id,
6846                                        uvm_page_index_t page_index,
6847                                        uvm_prot_t new_prot)
6848 {
6849     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
6850     uvm_va_range_t *va_range = block->va_range;
6851     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6852     struct vm_area_struct *vma;
6853     NV_STATUS status;
6854     NvU64 addr;
6855     struct page *page;
6856 
6857     UVM_ASSERT((uvm_va_block_is_hmm(block) && hmm_vma) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6858     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6859     UVM_ASSERT(new_prot < UVM_PROT_MAX);
6860     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6861 
6862     uvm_assert_mutex_locked(&block->lock);
6863     if (UVM_ID_IS_CPU(resident_id))
6864         UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index));
6865 
6866     // For the CPU, write implies atomic
6867     if (new_prot == UVM_PROT_READ_WRITE)
6868         new_prot = UVM_PROT_READ_WRITE_ATOMIC;
6869 
6870     // Only upgrades are supported in this function
6871     UVM_ASSERT(curr_prot <= new_prot);
6872 
6873     if (new_prot == curr_prot)
6874         return NV_OK;
6875 
6876     // Check for existing VMA permissions. They could have been modified after
6877     // the initial mmap by mprotect.
6878     if (new_prot > compute_logical_prot(block, hmm_vma, page_index))
6879         return NV_ERR_INVALID_ACCESS_TYPE;
6880 
6881     if (uvm_va_block_is_hmm(block)) {
6882         // Do not map CPU pages because they belong to the Linux kernel.
6883         return NV_OK;
6884     }
6885 
6886     UVM_ASSERT(va_range);
6887 
6888     if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
6889         // Add the page's range group range to the range group's migrated list.
6890         uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
6891                                                                   uvm_va_block_cpu_page_address(block, page_index));
6892         if (rgr != NULL) {
6893             uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
6894             if (list_empty(&rgr->range_group_migrated_list_node))
6895                 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
6896             uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
6897         }
6898     }
6899 
6900     // It's possible here that current->mm != vma->vm_mm. That can happen for
6901     // example due to access_process_vm (ptrace) or get_user_pages from another
6902     // driver.
6903     //
6904     // In such cases the caller has taken care of ref counting vma->vm_mm for
6905     // us, so we can safely operate on the vma but we can't use
6906     // uvm_va_range_vma_current.
6907     vma = uvm_va_range_vma(va_range);
6908     uvm_assert_mmap_lock_locked(vma->vm_mm);
6909     UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm);
6910 
6911     // Add the mapping
6912     addr = uvm_va_block_cpu_page_address(block, page_index);
6913 
    // This unmap handles upgrades as vm_insert_page returns -EBUSY when
    // there's already a mapping present at addr, so we have to unmap first
    // anyway when upgrading from RO -> RW.
6917     if (curr_prot != UVM_PROT_NONE)
6918         unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1);
6919 
6920     // Don't map the CPU until prior copies and GPU PTE updates finish,
6921     // otherwise we might not stay coherent.
6922     status = uvm_tracker_wait(&block->tracker);
6923     if (status != NV_OK)
6924         return status;
6925 
6926     page = block_page_get(block, block_phys_page(resident_id, page_index));
6927     return uvm_cpu_insert_page(vma, addr, page, new_prot);
6928 }
6929 
6930 // Maps the CPU to the given pages which are resident on resident_id.
6931 // map_page_mask is an in/out parameter: the pages which are mapped to
6932 // resident_id are removed from the mask before returning.
6933 //
6934 // Caller must ensure that:
6935 // -  Pages in map_page_mask must not be set in the corresponding cpu.pte_bits
6936 // mask for the requested protection.
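//
// For example (hypothetical page indices): if map_page_mask requests pages
// 0..63 but only pages 0..31 are resident on resident_id, pages 0..31 are
// mapped and cleared from map_page_mask, leaving pages 32..63 set for a
// subsequent call targeting a different residency.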
6937 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
6938                                   uvm_va_block_context_t *block_context,
6939                                   uvm_processor_id_t resident_id,
6940                                   uvm_va_block_region_t region,
6941                                   uvm_page_mask_t *map_page_mask,
6942                                   uvm_prot_t new_prot,
6943                                   uvm_tracker_t *out_tracker)
6944 {
6945     NV_STATUS status = NV_OK;
6946     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6947     uvm_page_index_t page_index;
6948     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
6949     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id);
6950     uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
6951     uvm_pte_bits_cpu_t pte_bit;
6952 
6953     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6954 
6955     // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls
6956     //       within block_map_cpu_page_to by doing them once here is helpful.
6957 
6958     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
6959                                   map_page_mask,
6960                                   &block->cpu.pte_bits[prot_pte_bit]));
6961 
6962     // The pages which will actually change are those in the input page mask
6963     // which are resident on the target.
6964     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
6965         return NV_OK;
6966 
6967     status = block_pre_populate_pde1_all_gpus(block, out_tracker);
6968     if (status != NV_OK)
6969         return status;
6970 
6971     block->cpu.ever_mapped = true;
6972 
6973     for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) {
6974         status = block_map_cpu_page_to(block,
6975                                        block_context->hmm.vma,
6976                                        resident_id,
6977                                        page_index,
6978                                        new_prot);
6979         if (status != NV_OK)
6980             break;
6981 
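        // Mark the CPU as mapped once at least one page has been mapped, so
        // the mapped mask stays consistent even if a later page fails.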
6982         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
6983     }
6984 
6985     // If there was some error, shrink the region so that we only update the
6986     // pte/mapping tracking bits for the pages that succeeded
6987     if (status != NV_OK) {
6988         region = uvm_va_block_region(region.first, page_index);
6989         uvm_page_mask_region_clear_outside(pages_to_map, region);
6990     }
6991 
    // If pages are mapped from a remote residency, notify tools of the remote
    // mapping events. We skip event notification if the cause is Invalid: that
    // value is used to signal that this function is being called from the
    // revocation path, to avoid reporting duplicate events.
6996     if (UVM_ID_IS_GPU(resident_id) &&
6997         va_space->tools.enabled &&
6998         block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) {
6999         uvm_va_block_region_t subregion;
7000         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7001             uvm_tools_record_map_remote(block,
7002                                         NULL,
7003                                         UVM_ID_CPU,
7004                                         resident_id,
7005                                         uvm_va_block_region_start(block, subregion),
7006                                         uvm_va_block_region_size(subregion),
7007                                         block_context->mapping.cause);
7008         }
7009     }
7010 
7011     // Update CPU mapping state
7012     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7013         uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
7014 
7015     uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
7016 
7017     UVM_ASSERT(block_check_mappings(block));
7018 
7019     // Remove all pages that were newly-mapped from the input mask
7020     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7021 
7022     return status;
7023 }
7024 
7025 // Maps the GPU to the given pages which are resident on resident_id.
7026 // map_page_mask is an in/out parameter: the pages which are mapped
7027 // to resident_id are removed from the mask before returning.
7028 //
7029 // Caller must ensure that:
7030 // -  Pages in map_page_mask must not be set in the corresponding pte_bits mask
7031 // for the requested protection on the mapping GPU.
7032 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
7033                                   uvm_va_block_context_t *block_context,
7034                                   uvm_gpu_t *gpu,
7035                                   uvm_processor_id_t resident_id,
7036                                   uvm_page_mask_t *map_page_mask,
7037                                   uvm_prot_t new_prot,
7038                                   uvm_tracker_t *out_tracker)
7039 {
7040     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7041     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7042     uvm_push_t push;
7043     NV_STATUS status;
7044     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
7045     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7046     uvm_pte_bits_gpu_t pte_bit;
7047     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7048     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7049     block_pte_op_t pte_op;
7050 
7051     UVM_ASSERT(map_page_mask);
7052     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
7053 
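    // UVM-Lite GPUs are only ever mapped to the preferred location (see
    // map_get_allowed_destinations).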
7054     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7055         UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
7056 
7057     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
7058                                   map_page_mask,
7059                                   &gpu_state->pte_bits[prot_pte_bit]));
7060 
7061     // The pages which will actually change are those in the input page mask
7062     // which are resident on the target.
7063     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
7064         return NV_OK;
7065 
7066     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map));
7067 
7068     // For PTE merge/split computation, compute all resident pages which will
7069     // have exactly new_prot after performing the mapping.
7070     uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map);
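    // Pages which already have a protection higher than new_prot keep it, so
    // they won't end up with exactly new_prot: exclude them.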
7071     if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) {
7072         uvm_page_mask_andnot(&block_context->scratch_page_mask,
7073                              &block_context->scratch_page_mask,
7074                              &gpu_state->pte_bits[prot_pte_bit + 1]);
7075     }
7076     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7077 
7078     block_gpu_compute_new_pte_state(va_block,
7079                                     gpu,
7080                                     resident_id,
7081                                     pages_to_map,
7082                                     &block_context->scratch_page_mask,
7083                                     new_pte_state);
7084 
7085     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7086     if (status != NV_OK)
7087         return status;
7088 
7089     status = uvm_push_begin_acquire(gpu->channel_manager,
7090                                     UVM_CHANNEL_TYPE_MEMOPS,
7091                                     &va_block->tracker,
7092                                     &push,
7093                                     "Mapping pages in block [0x%llx, 0x%llx) as %s",
7094                                     va_block->start,
7095                                     va_block->end + 1,
7096                                     uvm_prot_string(new_prot));
7097     if (status != NV_OK)
7098         return status;
7099 
7100     pte_op = BLOCK_PTE_OP_MAP;
7101     if (new_pte_state->pte_is_2m) {
7102         // We're either modifying permissions of a pre-existing 2M PTE, or all
7103         // permissions match so we can merge to a new 2M PTE.
7104         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7105     }
7106     else if (gpu_state->pte_is_2m) {
7107         // Permissions on a subset of the existing 2M PTE are being upgraded, so
7108         // we have to split it into the appropriate mix of big and 4k PTEs.
7109         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7110     }
7111     else {
7112         // We're upgrading permissions on some pre-existing mix of big and 4K
7113         // PTEs into some other mix of big and 4K PTEs.
7114         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7115     }
7116 
7117     // If we are mapping remotely, record the event
7118     if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) {
7119         uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block);
7120 
7121         UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid);
7122 
7123         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7124             uvm_tools_record_map_remote(va_block,
7125                                         &push,
7126                                         gpu->id,
7127                                         resident_id,
7128                                         uvm_va_block_region_start(va_block, subregion),
7129                                         uvm_va_block_region_size(subregion),
7130                                         block_context->mapping.cause);
7131         }
7132     }
7133 
7134     uvm_push_end(&push);
7135 
7136     // Update GPU mapping state
7137     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7138         uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map);
7139 
7140     uvm_processor_mask_set(&va_block->mapped, gpu->id);
7141 
7142     // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages
7143     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7144         uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
7145 
    // Remove all pages resident on this processor, i.e. the pages which were
    // newly mapped above, from the input mask.
7148     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7149 
7150     UVM_ASSERT(block_check_mappings(va_block));
7151 
7152     return uvm_tracker_add_push_safe(out_tracker, &push);
7153 }
7154 
7155 static void map_get_allowed_destinations(uvm_va_block_t *block,
7156                                          uvm_va_block_context_t *va_block_context,
7157                                          const uvm_va_policy_t *policy,
7158                                          uvm_processor_id_t id,
7159                                          uvm_processor_mask_t *allowed_mask)
7160 {
7161     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7162 
7163     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
7164         // UVM-Lite can only map resident pages on the preferred location
7165         uvm_processor_mask_zero(allowed_mask);
7166         uvm_processor_mask_set(allowed_mask, policy->preferred_location);
7167     }
7168     else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
7169               (uvm_id_equal(policy->preferred_location, id) &&
7170                !is_uvm_fault_force_sysmem_set() &&
7171                !uvm_hmm_must_use_sysmem(block, va_block_context))) &&
7172              uvm_va_space_processor_has_memory(va_space, id)) {
7173         // When operating under read-duplication we should only map the local
7174         // processor to cause fault-and-duplicate of remote pages.
7175         //
7176         // The same holds when this processor is the preferred location: only
7177         // create local mappings to force remote pages to fault-and-migrate.
7178         uvm_processor_mask_zero(allowed_mask);
7179         uvm_processor_mask_set(allowed_mask, id);
7180     }
7181     else {
7182         // Common case: Just map wherever the memory happens to reside
7183         uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]);
7184         return;
7185     }
7186 
7187     // Clamp to resident and accessible processors
7188     uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident);
7189     uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]);
7190 }
7191 
7192 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
7193                            uvm_va_block_context_t *va_block_context,
7194                            uvm_processor_id_t id,
7195                            uvm_va_block_region_t region,
7196                            const uvm_page_mask_t *map_page_mask,
7197                            uvm_prot_t new_prot,
7198                            UvmEventMapRemoteCause cause,
7199                            uvm_tracker_t *out_tracker)
7200 {
7201     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7202     uvm_gpu_t *gpu = NULL;
7203     uvm_processor_mask_t allowed_destinations;
7204     uvm_processor_id_t resident_id;
7205     const uvm_page_mask_t *pte_mask;
7206     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
7207     NV_STATUS status;
7208     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
7209 
7210     va_block_context->mapping.cause = cause;
7211 
7212     UVM_ASSERT(new_prot != UVM_PROT_NONE);
7213     UVM_ASSERT(new_prot < UVM_PROT_MAX);
7214     uvm_assert_mutex_locked(&va_block->lock);
7215 
7216     // Mapping is not supported on the eviction path that doesn't hold the VA
7217     // space lock.
7218     uvm_assert_rwsem_locked(&va_space->lock);
7219 
7220     if (UVM_ID_IS_CPU(id)) {
7221         uvm_pte_bits_cpu_t prot_pte_bit;
7222 
7223         // Check if the current thread is allowed to call vm_insert_page
7224         if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm))
7225             return NV_OK;
7226 
7227         prot_pte_bit = get_cpu_pte_bit_index(new_prot);
7228         pte_mask = &va_block->cpu.pte_bits[prot_pte_bit];
7229     }
7230     else {
7231         uvm_va_block_gpu_state_t *gpu_state;
7232         uvm_pte_bits_gpu_t prot_pte_bit;
7233 
7234         gpu = uvm_va_space_get_gpu(va_space, id);
7235 
7236         // Although this GPU UUID is registered in the VA space, it might not have a
7237         // GPU VA space registered.
7238         if (!uvm_gpu_va_space_get(va_space, gpu))
7239             return NV_OK;
7240 
7241         gpu_state = block_gpu_state_get_alloc(va_block, gpu);
7242         if (!gpu_state)
7243             return NV_ERR_NO_MEMORY;
7244 
7245         prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7246         pte_mask = &gpu_state->pte_bits[prot_pte_bit];
7247     }
7248 
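    // Start from the pages in the region which don't already have the
    // requested protection; if there are none, there's nothing to do.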
7249     uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask);
7250 
7251     if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask))
7252         return NV_OK;
7253 
7254     // Map per resident location so we can more easily detect physically-
7255     // contiguous mappings.
7256     map_get_allowed_destinations(va_block, va_block_context, policy, id, &allowed_destinations);
7257 
7258     for_each_closest_id(resident_id, &allowed_destinations, id, va_space) {
7259         if (UVM_ID_IS_CPU(id)) {
7260             status = block_map_cpu_to(va_block,
7261                                       va_block_context,
7262                                       resident_id,
7263                                       region,
7264                                       running_page_mask,
7265                                       new_prot,
7266                                       out_tracker);
7267         }
7268         else {
7269             status = block_map_gpu_to(va_block,
7270                                       va_block_context,
7271                                       gpu,
7272                                       resident_id,
7273                                       running_page_mask,
7274                                       new_prot,
7275                                       out_tracker);
7276         }
7277 
7278         if (status != NV_OK)
7279             return status;
7280 
7281         // If we've mapped all requested pages, we're done
7282         if (uvm_page_mask_region_empty(running_page_mask, region))
7283             break;
7284     }
7285 
7286     return NV_OK;
7287 }
7288 
// Revokes write permission from the given pages mapped by the CPU. This is
// implemented by unmapping the pages and re-mapping them later with the lower
// permission, which is required because vm_insert_page can only be used for
// upgrades from Invalid.
7292 //
7293 // Caller must ensure that:
7294 // -  Pages in revoke_page_mask must be set in the
7295 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask.
7296 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block,
7297                                         uvm_va_block_context_t *block_context,
7298                                         uvm_va_block_region_t region,
7299                                         const uvm_page_mask_t *revoke_page_mask,
7300                                         uvm_tracker_t *out_tracker)
7301 {
7302     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7303     uvm_va_block_region_t subregion;
7304 
7305     UVM_ASSERT(revoke_page_mask);
7306 
7307     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
7308 
7309     block_unmap_cpu(block, region, revoke_page_mask);
7310 
7311     // Coalesce revocation event notification
7312     for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) {
7313         uvm_perf_event_notify_revocation(&va_space->perf_events,
7314                                          block,
7315                                          UVM_ID_CPU,
7316                                          uvm_va_block_region_start(block, subregion),
7317                                          uvm_va_block_region_size(subregion),
7318                                          UVM_PROT_READ_WRITE_ATOMIC,
7319                                          UVM_PROT_READ_ONLY);
7320     }
7321 
7322     // uvm_va_block_map will skip this remap if we aren't holding the right mm
7323     // lock.
7324     return uvm_va_block_map(block,
7325                             block_context,
7326                             UVM_ID_CPU,
7327                             region,
7328                             revoke_page_mask,
7329                             UVM_PROT_READ_ONLY,
7330                             UvmEventMapRemoteCauseInvalid,
7331                             out_tracker);
7332 }
7333 
7334 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
7335                                               uvm_va_block_context_t *block_context,
7336                                               uvm_gpu_t *gpu,
7337                                               uvm_prot_t prot_revoked,
7338                                               const uvm_page_mask_t *pages_revoked)
7339 {
7340     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7341     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7342     uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
7343     uvm_pte_bits_gpu_t pte_bit;
7344 
7345     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
7346         uvm_prot_t old_prot;
7347 
7348         if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
7349             continue;
7350 
7351         if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
7352             old_prot = UVM_PROT_READ_WRITE_ATOMIC;
7353         else
7354             old_prot = UVM_PROT_READ_WRITE;
7355 
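        // The protection remaining after the revocation is one level below the
        // protection being revoked, hence the prot_revoked - 1 below.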
7356         for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
7357             uvm_perf_event_notify_revocation(&va_space->perf_events,
7358                                              block,
7359                                              gpu->id,
7360                                              uvm_va_block_region_start(block, subregion),
7361                                              uvm_va_block_region_size(subregion),
7362                                              old_prot,
7363                                              prot_revoked - 1);
7364         }
7365     }
7366 }
7367 
// Revokes the given pages mapped by the GPU which are resident on resident_id.
7369 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
7370 // permissions and are mapped to resident_id are removed from the mask before
7371 // returning.
7372 //
7373 // Caller must ensure that:
// -  Pages in revoke_page_mask must be set in the corresponding pte_bits mask
// for the protection to be revoked on the mapping GPU.
7376 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
7377                                           uvm_va_block_context_t *block_context,
7378                                           uvm_gpu_t *gpu,
7379                                           uvm_processor_id_t resident_id,
7380                                           uvm_page_mask_t *revoke_page_mask,
7381                                           uvm_prot_t prot_to_revoke,
7382                                           uvm_tracker_t *out_tracker)
7383 {
7384     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7385     uvm_push_t push;
7386     NV_STATUS status;
7387     uvm_pte_bits_gpu_t pte_bit;
7388     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7389     uvm_prot_t new_prot = prot_to_revoke - 1;
7390     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7391     block_pte_op_t pte_op;
7392     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7393     uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
7394 
7395     UVM_ASSERT(revoke_page_mask);
7396     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
7397 
7398     // The pages which will actually change are those in the input page mask
7399     // which are resident on the target.
7400     if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
7401         return NV_OK;
7402 
7403     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke));
7404 
7405     // For PTE merge/split computation, compute all resident pages which will
7406     // have exactly prot_to_revoke-1 after performing the revocation.
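    // That is: pages which currently have at least prot_to_revoke - 1, minus
    // the pages which will still hold prot_to_revoke after this call (those
    // mapped at that level but not included in this revocation).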
7407     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7408     uvm_page_mask_andnot(&block_context->scratch_page_mask,
7409                          &gpu_state->pte_bits[prot_pte_bit - 1],
7410                          &block_context->scratch_page_mask);
7411     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7412 
7413     block_gpu_compute_new_pte_state(va_block,
7414                                     gpu,
7415                                     resident_id,
7416                                     pages_to_revoke,
7417                                     &block_context->scratch_page_mask,
7418                                     new_pte_state);
7419 
7420     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7421     if (status != NV_OK)
7422         return status;
7423 
7424     status = uvm_push_begin_acquire(gpu->channel_manager,
7425                                     UVM_CHANNEL_TYPE_MEMOPS,
7426                                     &va_block->tracker,
7427                                     &push,
7428                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7429                                     uvm_prot_string(prot_to_revoke),
7430                                     va_block->start,
7431                                     va_block->end + 1);
7432     if (status != NV_OK)
7433         return status;
7434 
7435     pte_op = BLOCK_PTE_OP_REVOKE;
7436     if (new_pte_state->pte_is_2m) {
7437         // We're either modifying permissions of a pre-existing 2M PTE, or all
7438         // permissions match so we can merge to a new 2M PTE.
7439         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7440     }
7441     else if (gpu_state->pte_is_2m) {
7442         // Permissions on a subset of the existing 2M PTE are being downgraded,
7443         // so we have to split it into the appropriate mix of big and 4k PTEs.
7444         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7445     }
7446     else {
7447         // We're downgrading permissions on some pre-existing mix of big and 4K
7448         // PTEs into some other mix of big and 4K PTEs.
7449         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7450     }
7451 
7452     uvm_push_end(&push);
7453 
7454     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7455 
7456     // Update GPU mapping state
7457     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7458         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7459 
    // Remove all pages resident on this processor from the input mask: both
    // the pages which were revoked and the pages which already had the correct
    // permissions.
7463     uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke);
7464 
7465     UVM_ASSERT(block_check_mappings(va_block));
7466 
7467     return uvm_tracker_add_push_safe(out_tracker, &push);
7468 }
7469 
7470 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
7471                                    uvm_va_block_context_t *va_block_context,
7472                                    uvm_processor_id_t id,
7473                                    uvm_va_block_region_t region,
7474                                    const uvm_page_mask_t *revoke_page_mask,
7475                                    uvm_prot_t prot_to_revoke,
7476                                    uvm_tracker_t *out_tracker)
7477 {
7478     uvm_gpu_t *gpu;
7479     uvm_va_block_gpu_state_t *gpu_state;
7480     uvm_processor_mask_t resident_procs;
7481     uvm_processor_id_t resident_id;
7482     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask;
7483     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7484     uvm_pte_bits_gpu_t prot_pte_bit;
7485 
7486     UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY);
7487     UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX);
7488     uvm_assert_mutex_locked(&va_block->lock);
7489 
7490     if (UVM_ID_IS_CPU(id)) {
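        // CPU mappings don't track atomic permissions separately from write,
        // so revoking only atomic access is a no-op.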
7491         if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC)
7492             return NV_OK;
7493 
7494         if (uvm_va_block_is_hmm(va_block)) {
7495             // Linux is responsible for CPU page table updates.
7496             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region);
7497             return NV_OK;
7498         }
7499 
7500         uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7501 
7502         if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]))
7503             return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker);
7504 
7505         return NV_OK;
7506     }
7507 
7508     gpu = uvm_va_space_get_gpu(va_space, id);
7509 
7510     // UVM-Lite GPUs should never have access revoked
7511     UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id),
7512                    "GPU %s\n", uvm_gpu_name(gpu));
7513 
7514     // Return early if there are no mappings for the GPU present in the block
7515     if (!uvm_processor_mask_test(&va_block->mapped, gpu->id))
7516         return NV_OK;
7517 
7518     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7519     prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7520 
7521     uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7522 
7523     if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit]))
7524         return NV_OK;
7525 
7526     // Revoke per resident location so we can more easily detect physically-
7527     // contiguous mappings.
7528     uvm_processor_mask_copy(&resident_procs, &va_block->resident);
7529 
7530     for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) {
7531         NV_STATUS status = block_revoke_prot_gpu_to(va_block,
7532                                                     va_block_context,
7533                                                     gpu,
7534                                                     resident_id,
7535                                                     running_page_mask,
7536                                                     prot_to_revoke,
7537                                                     out_tracker);
7538         if (status != NV_OK)
7539             return status;
7540 
7541         // If we've revoked all requested pages, we're done
7542         if (uvm_page_mask_region_empty(running_page_mask, region))
7543             break;
7544     }
7545 
7546     return NV_OK;
7547 }
7548 
7549 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
7550                                 uvm_va_block_context_t *va_block_context,
7551                                 const uvm_processor_mask_t *map_processor_mask,
7552                                 uvm_va_block_region_t region,
7553                                 const uvm_page_mask_t *map_page_mask,
7554                                 uvm_prot_t new_prot,
7555                                 UvmEventMapRemoteCause cause)
7556 {
7557     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7558     NV_STATUS status = NV_OK;
7559     NV_STATUS tracker_status;
7560     uvm_processor_id_t id;
7561 
7562     for_each_id_in_mask(id, map_processor_mask) {
7563         status = uvm_va_block_map(va_block,
7564                                   va_block_context,
7565                                   id,
7566                                   region,
7567                                   map_page_mask,
7568                                   new_prot,
7569                                   cause,
7570                                   &local_tracker);
7571         if (status != NV_OK)
7572             break;
7573     }
7574 
7575     // Regardless of error, add the successfully-pushed mapping operations into
7576     // the block's tracker. Note that we can't overwrite the tracker because we
7577     // aren't guaranteed that the map actually pushed anything (in which case it
7578     // would've acquired the block tracker first).
7579     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7580     uvm_tracker_deinit(&local_tracker);
7581 
7582     return status == NV_OK ? tracker_status : status;
7583 }
7584 
7585 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
7586                                   uvm_va_block_context_t *va_block_context,
7587                                   const uvm_processor_mask_t *unmap_processor_mask,
7588                                   uvm_va_block_region_t region,
7589                                   const uvm_page_mask_t *unmap_page_mask)
7590 {
7591     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7592     NV_STATUS status = NV_OK;
7593     NV_STATUS tracker_status;
7594     uvm_processor_id_t id;
7595 
    // Watch out: unmap_processor_mask could change during iteration since it
    // could be &va_block->mapped.
7598     for_each_id_in_mask(id, unmap_processor_mask) {
        // An error could be either a system-fatal error (ECC) or an allocation
        // retry due to PTE splitting. In either case we should stop after
        // hitting the first one.
7602         status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker);
7603         if (status != NV_OK)
7604             break;
7605     }
7606 
7607     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7608     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7609     uvm_tracker_deinit(&local_tracker);
7610 
7611     return status == NV_OK ? tracker_status : status;
7612 }
7613 
7614 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
7615                                         uvm_va_block_context_t *va_block_context,
7616                                         const uvm_processor_mask_t *revoke_processor_mask,
7617                                         uvm_va_block_region_t region,
7618                                         const uvm_page_mask_t *revoke_page_mask,
7619                                         uvm_prot_t prot_to_revoke)
7620 {
7621     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7622     NV_STATUS status = NV_OK;
7623     NV_STATUS tracker_status;
7624     uvm_processor_id_t id;
7625 
7626     for_each_id_in_mask(id, revoke_processor_mask) {
7627         status = uvm_va_block_revoke_prot(va_block,
7628                                           va_block_context,
7629                                           id,
7630                                           region,
7631                                           revoke_page_mask,
7632                                           prot_to_revoke,
7633                                           &local_tracker);
7634         if (status != NV_OK)
7635             break;
7636     }
7637 
7638     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7639     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7640     uvm_tracker_deinit(&local_tracker);
7641 
7642     return status == NV_OK ? tracker_status : status;
7643 }
7644 
7645 // Updates the read_duplicated_pages mask in the block when the state of GPU id
7646 // is being destroyed
7647 static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
7648                                               uvm_gpu_id_t id,
7649                                               uvm_va_block_gpu_state_t *gpu_state)
7650 {
7651     uvm_gpu_id_t running_id;
7652     bool first = true;
7653     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7654     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7655     uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask;
7656     uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask;
7657 
7658     uvm_page_mask_zero(&block->read_duplicated_pages);
7659 
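    // A page is read-duplicated if it is resident on at least two of the
    // remaining processors: accumulate residency in running_page_mask and
    // record any overlap with each processor's resident mask.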
7660     for_each_id_in_mask(running_id, &block->resident) {
7661         const uvm_page_mask_t *running_residency_mask;
7662 
7663         if (uvm_id_equal(running_id, id))
7664             continue;
7665 
7666         running_residency_mask = uvm_va_block_resident_mask_get(block, running_id);
7667 
7668         if (first) {
7669             uvm_page_mask_copy(running_page_mask, running_residency_mask);
7670             first = false;
7671             continue;
7672         }
7673 
7674         if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask))
7675             uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask);
7676 
7677         uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask);
7678     }
7679 }
7680 
7681 // Unmaps all GPU mappings under this block, frees the page tables, and frees
7682 // all the GPU chunks. This simply drops the chunks on the floor, so the caller
7683 // must take care of copying the data elsewhere if it needs to remain intact.
7684 //
7685 // This serializes on the block tracker since it must unmap page tables.
7686 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id)
7687 {
7688     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
7689     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7690     uvm_gpu_va_space_t *gpu_va_space;
7691     uvm_gpu_t *gpu, *other_gpu;
7692 
7693     if (!gpu_state)
7694         return;
7695 
7696     uvm_assert_mutex_locked(&block->lock);
7697 
7698     // Unmap PTEs and free page tables
7699     gpu = uvm_va_space_get_gpu(va_space, id);
7700     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
7701     if (gpu_va_space) {
7702         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7703 
7704         uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
7705     }
7706 
7707     UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
7708 
7709     // No processor should have this GPU mapped at this point
7710     UVM_ASSERT(block_check_processor_not_mapped(block, id));
7711 
7712     // We need to remove the mappings of the indirect peers from the reverse
7713     // map when the GPU state is being destroyed (for example, on
7714     // unregister_gpu) and when peer access between indirect peers is disabled.
7715     // However, we need to avoid double mapping removals. There are two
7716     // possible scenarios:
7717     // - Disable peer access first. This will remove all mappings between A and
7718     // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
7719     // unregister_gpu will not operate on that pair of GPUs.
7720     // - Unregister GPU first. This will remove all mappings from all indirect
7721     // peers to the GPU being unregistered. It will also destroy its GPU state.
    // Subsequent calls to disable peers will only try to remove mappings from
    // the GPU being unregistered, never mappings to it (since it no longer has
    // a valid GPU state).
7725     for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
7726         block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
7727 
7728     if (gpu_state->chunks) {
7729         size_t i, num_chunks;
7730 
7731         update_read_duplicated_pages_mask(block, id, gpu_state);
7732         uvm_page_mask_zero(&gpu_state->resident);
7733         block_clear_resident_processor(block, id);
7734 
7735         num_chunks = block_num_gpu_chunks(block, gpu);
7736         for (i = 0; i < num_chunks; i++) {
7737             uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
7738             if (!chunk)
7739                 continue;
7740 
7741             uvm_mmu_chunk_unmap(chunk, &block->tracker);
7742             uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
7743         }
7744 
7745         uvm_kvfree(gpu_state->chunks);
7746     }
7747     else {
7748         UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
7749     }
7751 
7752     // Pending operations may still need the DMA memory to be mapped.
7753     uvm_tracker_wait(&block->tracker);
7754 
7755     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
7756     uvm_processor_mask_clear(&block->evicted_gpus, id);
7757 
7758     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
7759     block->gpus[uvm_id_gpu_index(id)] = NULL;
7760 }
7761 
7762 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
7763 {
7764     if (range->table) {
7765         uvm_page_tree_put_ptes(tree, range);
7766         memset(range, 0, sizeof(*range));
7767     }
7768 }
7769 
7770 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
7771 {
7772     uvm_assert_mutex_locked(&va_block->lock);
7773 
7774     if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
7775         return NV_OK;
7776 
7777     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7778     // comments in pre_populate_pde1_gpu.
7779     return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
7780 }
7781 
7782 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
7783                                       uvm_gpu_va_space_t *gpu_va_space,
7784                                       uvm_va_block_context_t *block_context)
7785 {
7786     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7787     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7788     uvm_gpu_t *gpu = gpu_va_space->gpu;
7789     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7790     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7791     uvm_push_t push;
7792     NV_STATUS status;
7793 
7794     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7795 
7796     if (!gpu_state)
7797         return;
7798 
7799     uvm_assert_mutex_locked(&va_block->lock);
7800 
7801     // Unmapping the whole block won't cause a page table split, so this should
7802     // only fail if we have a system-fatal error.
7803     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
7804     if (status != NV_OK) {
7805         UVM_ASSERT(status == uvm_global_get_status());
7806         return; // Just leak
7807     }
7808 
7809     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
7810 
7811     // Reset the page tables if other allocations could reuse them
7812     if (!block_gpu_supports_2m(va_block, gpu) &&
7813         !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7814 
7815         status = uvm_push_begin_acquire(gpu->channel_manager,
7816                                         UVM_CHANNEL_TYPE_MEMOPS,
7817                                         &local_tracker,
7818                                         &push,
7819                                         "Resetting PTEs for block [0x%llx, 0x%llx)",
7820                                         va_block->start,
7821                                         va_block->end + 1);
7822         if (status != NV_OK) {
7823             UVM_ASSERT(status == uvm_global_get_status());
7824             return; // Just leak
7825         }
7826 
7827         uvm_pte_batch_begin(&push, pte_batch);
7828         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7829 
        // When a big PTE is active, the 4k PTEs under it are garbage. Make
7831         // them invalid so the page tree code can reuse them for other
7832         // allocations on this VA. These don't need TLB invalidates since the
7833         // big PTEs above them are active.
7834         if (gpu_state->page_table_range_4k.table) {
7835             uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
7836             block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
7837         }
7838 
7839         // We unmapped all big PTEs above, which means they have the unmapped
7840         // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
7841         // invalid to activate the 4ks below so new allocations using just those
7842         // 4k PTEs will work.
7843         block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch);
7844 
7845         uvm_pte_batch_end(pte_batch);
7846         uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE);
7847 
7848         uvm_push_end(&push);
7849         uvm_tracker_overwrite_with_push(&local_tracker, &push);
7850     }
7851 
7852     // The unmap must finish before we free the page tables
7853     status = uvm_tracker_wait_deinit(&local_tracker);
7854     if (status != NV_OK)
7855         return; // System-fatal error, just leak
7856 
7857     // Note that if the PTE is currently 2M with lower tables allocated but not
7858     // in use, calling put_ptes on those lower ranges will re-write the 2M entry
7859     // to be a PDE.
7860     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k);
7861     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big);
7862     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m);
7863 
7864     gpu_state->pte_is_2m = false;
7865     gpu_state->initialized_big = false;
7866     gpu_state->activated_big = false;
7867     gpu_state->activated_4k = false;
7868     bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7869 
7870     UVM_ASSERT(block_check_mappings(va_block));
7871 }
7872 
7873 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7874 {
7875     NV_STATUS status;
7876     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7877 
7878     UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
7879     uvm_assert_rwsem_locked_write(&va_space->lock);
7880     uvm_assert_mutex_locked(&va_block->lock);
7881 
7882     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7883         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7884         if (status != NV_OK)
7885             return status;
7886 
7887         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7888         if (status != NV_OK) {
7889             block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7890             return status;
7891         }
7892     }
7893 
7894     // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
7895     //       call it here.
7896 
7897     return NV_OK;
7898 }
7899 
7900 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7901 {
7902     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7903     NV_STATUS status;
7904     uvm_tracker_t tracker = UVM_TRACKER_INIT();
7905     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7906     uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask;
7907     const uvm_page_mask_t *resident0;
7908     const uvm_page_mask_t *resident1;
7909 
7910     uvm_assert_mutex_locked(&va_block->lock);
7911 
7912     // See comment in block_destroy_gpu_state
7913     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7914         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7915         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7916     }
7917 
7918     // If either of the GPUs doesn't have GPU state then nothing could be mapped
7919     // between them.
7920     if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
7921         return;
7922 
7923     resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
7924     resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
7925 
7926     // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
7927     if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
7928         status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
7929         if (status != NV_OK) {
7930             // Since all PTEs unmapped by this call have the same aperture, page
7931             // splits should never be required so any failure should be the
7932             // result of a system-fatal error.
7933             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7934                            "Unmapping failed: %s, GPU %s\n",
7935                            nvstatusToString(status),
7936                            uvm_gpu_name(gpu0));
7937         }
7938     }
7939 
7940     // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
7941     if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
7942         status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
7943         if (status != NV_OK) {
7944             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7945                            "Unmapping failed: %s, GPU %s\n",
7946                            nvstatusToString(status),
                           uvm_gpu_name(gpu1));
7948         }
7949     }
7950 
7951     status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
7952     if (status != NV_OK)
7953         UVM_ASSERT(status == uvm_global_get_status());
7954 
7955     status = uvm_tracker_wait_deinit(&tracker);
7956     if (status != NV_OK)
7957         UVM_ASSERT(status == uvm_global_get_status());
7958 }
7959 
7960 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
7961 {
7962     NV_STATUS status;
7963     uvm_va_range_t *va_range = va_block->va_range;
7964     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7965     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7966     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7967 
7968     uvm_assert_mutex_locked(&va_block->lock);
7969     UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
7970 
7971     // If the GPU doesn't have GPU state then nothing could be mapped.
7972     if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
7973         return;
7974 
7975     // In UVM-Lite mode, mappings to the preferred location are not tracked
7976     // directly, so just unmap the whole block.
7977     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
7978     if (status != NV_OK) {
7979         // Unmapping the whole block should not cause page splits so any failure
7980         // should be the result of a system-fatal error.
7981         UVM_ASSERT_MSG(status == uvm_global_get_status(),
7982                        "Unmapping failed: %s, GPU %s\n",
7983                        nvstatusToString(status), uvm_gpu_name(gpu));
7984     }
7985 
7986     status = uvm_tracker_wait(&va_block->tracker);
7987     if (status != NV_OK) {
7988         UVM_ASSERT_MSG(status == uvm_global_get_status(),
7989                        "Unmapping failed: %s, GPU %s\n",
7990                        nvstatusToString(status), uvm_gpu_name(gpu));
7991     }
7992 }
7993 
7994 // Evict pages from the GPU by moving each resident region to the CPU
7995 //
7996 // Notably the caller needs to support allocation-retry as
7997 // uvm_va_block_migrate_locked() requires that.
7998 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
7999 {
8000     NV_STATUS status = NV_OK;
8001     const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id);
8002     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
8003     uvm_va_block_region_t subregion;
8004     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8005     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm);
8006 
8007     // Move all subregions resident on the GPU to the CPU
8008     for_each_va_block_subregion_in_mask(subregion, resident, region) {
8009         if (uvm_va_block_is_hmm(va_block)) {
8010             status = uvm_hmm_va_block_evict_pages_from_gpu(va_block,
8011                                                            gpu,
8012                                                            block_context,
8013                                                            resident,
8014                                                            subregion);
8015         }
8016         else {
8017             status = uvm_va_block_migrate_locked(va_block,
8018                                                  NULL,
8019                                                  block_context,
8020                                                  subregion,
8021                                                  UVM_ID_CPU,
8022                                                  UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
8023                                                  NULL);
8024         }
8025         if (status != NV_OK)
8026             return status;
8027     }
8028 
8029     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id));
8030     return NV_OK;
8031 }
8032 
8033 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8034 {
8035     NV_STATUS status;
8036     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8037 
8038     uvm_assert_mutex_locked(&va_block->lock);
8039 
8040     if (!gpu_state)
8041         return;
8042 
8043     // The mappings should've already been torn down by GPU VA space unregister
8044     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
8045     UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
8046     UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu));
8047 
8048     // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and
8049     // we don't rely on any state of the block across the call.
8050     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));
8051     if (status != NV_OK) {
8052         UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n",
8053                       nvstatusToString(status),
8054                       uvm_gpu_name(gpu));
8055         uvm_global_set_fatal_error(status);
8056     }
8057 
8058     // This function will copy the block's tracker into each chunk then free the
8059     // chunk to PMM. If we do this before waiting for the block tracker below
8060     // we'll populate PMM's free chunks with tracker entries, which gives us
8061     // better testing coverage of chunk synchronization on GPU unregister.
8062     block_destroy_gpu_state(va_block, gpu->id);
8063 
8064     // Any time a GPU is unregistered we need to make sure that there are no
8065     // pending (direct or indirect) tracker entries for that GPU left in the
8066     // block's tracker. The only way to ensure that is to wait for the whole
8067     // tracker.
8068     status = uvm_tracker_wait(&va_block->tracker);
8069     if (status != NV_OK)
8070         UVM_ASSERT(status == uvm_global_get_status());
8071 }
8072 
8073 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8074 {
8075     // Take the lock internally to not expose the caller to allocation-retry.
8076     uvm_mutex_lock(&va_block->lock);
8077 
8078     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
8079 
8080     uvm_mutex_unlock(&va_block->lock);
8081 }
8082 
8083 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
8084 {
8085     uvm_page_index_t page_index;
8086 
8087     uvm_assert_mutex_locked(&va_block->lock);
8088 
8089     for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region)
8090         block_mark_cpu_page_dirty(va_block, page_index);
8091 }
8092 
8093 // Tears down everything within the block, but doesn't free the block itself.
8094 // Note that when uvm_va_block_kill is called, this is called twice: once for
// the initial kill itself, then again when the block's ref count eventually
// drops to zero and the block is destroyed. block->va_range is used to track
// whether the block has already been killed.
8098 static void block_kill(uvm_va_block_t *block)
8099 {
8100     uvm_va_space_t *va_space;
8101     uvm_perf_event_data_t event_data;
8102     uvm_cpu_chunk_t *chunk;
8103     uvm_gpu_id_t id;
8104     NV_STATUS status;
8105     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
8106     uvm_page_index_t page_index;
8107     uvm_page_index_t next_page_index;
8108 
8109     if (uvm_va_block_is_dead(block))
8110         return;
8111 
8112     va_space = uvm_va_block_get_va_space(block);
8113     event_data.block_destroy.block = block;
8114     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
8115 
8116     // Unmap all processors in parallel first. Unmapping the whole block won't
8117     // cause a page table split, so this should only fail if we have a system-
8118     // fatal error.
8119     if (!uvm_processor_mask_empty(&block->mapped)) {
8120         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8121 
8122         // HMM CPU mappings are controlled by Linux so no need to unmap.
8123         // Remote GPU mappings will be removed below.
8124         if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
8125             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
8126             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
8127             uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
8128         }
8129 
8130         // We could only be killed with mapped GPU state by VA range free or VA
8131         // space teardown, so it's safe to use the va_space's block_context
8132         // because both of those have the VA space lock held in write mode.
8133         status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
8134         UVM_ASSERT(status == uvm_global_get_status());
8135     }
8136 
8137     UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
8138 
8139     // Free the GPU page tables and chunks
8140     for_each_gpu_id(id)
8141         block_destroy_gpu_state(block, id);
8142 
8143     // Wait for the GPU PTE unmaps before freeing CPU memory
8144     uvm_tracker_wait_deinit(&block->tracker);
8145 
8146     // No processor should have the CPU mapped at this point
8147     UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU));
8148 
8149     // Free CPU pages
8150     for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) {
        // Tell the OS we wrote to the page because we sometimes clear the
        // dirty bit after writing to it, so be conservative here. HMM dirty
        // flags are managed by the kernel, so HMM blocks are skipped.
8154         if (!uvm_va_block_is_hmm(block))
8155             uvm_cpu_chunk_mark_dirty(chunk, 0);
8156         uvm_cpu_chunk_remove_from_block(block, page_index);
8157         uvm_cpu_chunk_free(chunk);
8158     }
8159 
8160     uvm_kvfree((void *)block->cpu.chunks);
8161     block->cpu.chunks = 0;
8162 
8163     // Clearing the resident bit isn't strictly necessary since this block
8164     // is getting destroyed, but it keeps state consistent for assertions.
8165     uvm_page_mask_zero(&block->cpu.resident);
8166     block_clear_resident_processor(block, UVM_ID_CPU);
8167 
8168     if (uvm_va_block_is_hmm(block))
8169         uvm_va_policy_clear(block, block->start, block->end);
8170 
8171     block->va_range = NULL;
8172 #if UVM_IS_CONFIG_HMM()
8173     block->hmm.va_space = NULL;
8174 #endif
8175 }
8176 
8177 // Called when the block's ref count drops to 0
8178 void uvm_va_block_destroy(nv_kref_t *nv_kref)
8179 {
8180     uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref);
8181 
8182     // Nobody else should have a reference when freeing
8183     uvm_assert_mutex_unlocked(&block->lock);
8184 
8185     uvm_mutex_lock(&block->lock);
8186     block_kill(block);
8187     uvm_mutex_unlock(&block->lock);
8188 
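    // When built-in tests are enabled the block is embedded in a
    // uvm_va_block_wrapper_t, so free the containing wrapper. Both paths use
    // g_uvm_va_block_cache, which is assumed to have been created with the
    // wrapper's size in that configuration.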
8189     if (uvm_enable_builtin_tests) {
8190         uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
8191 
8192         kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
8193     }
8194     else {
8195         kmem_cache_free(g_uvm_va_block_cache, block);
8196     }
8197 }
8198 
8199 void uvm_va_block_kill(uvm_va_block_t *va_block)
8200 {
8201     uvm_mutex_lock(&va_block->lock);
8202     block_kill(va_block);
8203     uvm_mutex_unlock(&va_block->lock);
8204 
8205     // May call block_kill again
8206     uvm_va_block_release(va_block);
8207 }
8208 
8209 static void block_gpu_release_region(uvm_va_block_t *va_block,
8210                                      uvm_gpu_id_t gpu_id,
8211                                      uvm_va_block_gpu_state_t *gpu_state,
8212                                      uvm_page_mask_t *page_mask,
8213                                      uvm_va_block_region_t region)
8214 {
8215     uvm_page_index_t page_index;
8216 
8217     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
8218         uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index];
8219 
8220         if (!gpu_chunk)
8221             continue;
8222 
8223         // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
8224 
8225         uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
8226 
8227         // The GPU chunk will be freed when the device private reference drops.
8228         if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) &&
8229             uvm_page_mask_empty(&gpu_state->resident))
8230             block_clear_resident_processor(va_block, gpu_id);
8231 
8232         gpu_state->chunks[page_index] = NULL;
8233     }
8234 }
8235 
8236 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
8237                                 uvm_va_block_region_t region)
8238 {
8239     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8240     uvm_perf_event_data_t event_data;
8241     uvm_gpu_id_t gpu_id;
8242 
8243     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
8244     uvm_assert_mutex_locked(&va_block->lock);
8245 
8246     // Reset thrashing state for the region.
8247     event_data.block_munmap.block = va_block;
8248     event_data.block_munmap.region = region;
8249     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
8250 
8251     // Set a flag so that GPU fault events are flushed since they might refer
8252     // to the region being unmapped.
8253     // Note that holding the va_block lock prevents GPU VA spaces from
8254     // being removed so the registered_gpu_va_spaces mask is stable.
8255     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
8256         uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
8257     }
8258 
8259     // Release any remaining vidmem chunks in the given region.
8260     for_each_gpu_id(gpu_id) {
8261         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
8262 
8263         if (!gpu_state)
8264             continue;
8265 
8266         uvm_page_mask_region_clear(&gpu_state->evicted, region);
8267         if (uvm_page_mask_empty(&gpu_state->evicted))
8268             uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
8269 
8270         if (gpu_state->chunks) {
8271             block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
8272 
8273             // TODO: bug 3660922: Need to update the read duplicated pages mask
8274             // when read duplication is supported for HMM.
8275         }
8276         else {
8277             UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id));
8278         }
8279     }
8280 
8281     uvm_va_policy_clear(va_block,
8282                         uvm_va_block_region_start(va_block, region),
8283                         uvm_va_block_region_end(va_block, region));
8284 }
8285 
8286 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8287 {
8288     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8289     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8290     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8291     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
8292     NvU32 alloc_sizes;
8293     DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8294     uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8295     size_t big_page_index;
8296     uvm_push_t push;
8297     NV_STATUS status;
8298 
8299     // We only have to split to big PTEs if we're currently a 2M PTE
8300     if (existing_gpu_state->pte_is_2m) {
8301         // We can skip the split if the 2M PTE is invalid and we have no lower
8302         // PTEs.
8303         if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE &&
8304             !existing_gpu_state->page_table_range_big.table &&
8305             !existing_gpu_state->page_table_range_4k.table)
8306             return NV_OK;
8307 
8308         alloc_sizes = big_page_size;
8309         bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8310 
8311         if (!IS_ALIGNED(new->start, big_page_size)) {
8312             alloc_sizes |= UVM_PAGE_SIZE_4K;
8313 
8314             big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8315             __clear_bit(big_page_index, new_big_ptes);
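            // With this bit clear, the big page region containing the
            // unaligned split point is expected to be split all the way down
            // to 4K PTEs (hence the extra 4K allocation above) rather than
            // becoming a big PTE.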
8316         }
8317 
8318         status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL);
8319         if (status != NV_OK)
8320             return status;
8321 
8322         status = uvm_push_begin_acquire(gpu->channel_manager,
8323                                         UVM_CHANNEL_TYPE_MEMOPS,
8324                                         &existing->tracker,
8325                                         &push,
8326                                         "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8327                                         existing->start, existing->end + 1,
8328                                         new->start, new->end + 1);
8329         if (status != NV_OK)
8330             return status;
8331 
8332         block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push);
8333     }
8334     else {
8335         big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8336 
8337         // If the split point is on a big page boundary, or if the split point
8338         // is not currently covered by a big PTE, we don't have to split
8339         // anything.
8340         if (IS_ALIGNED(new->start, big_page_size) ||
8341             big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK ||
8342             !test_bit(big_page_index, existing_gpu_state->big_ptes))
8343             return NV_OK;
8344 
8345         status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL);
8346         if (status != NV_OK)
8347             return status;
8348 
8349         bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8350         __set_bit(big_page_index, new_big_ptes);
8351 
8352         status = uvm_push_begin_acquire(gpu->channel_manager,
8353                                         UVM_CHANNEL_TYPE_MEMOPS,
8354                                         &existing->tracker,
8355                                         &push,
8356                                         "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8357                                         existing->start, existing->end + 1,
8358                                         new->start, new->end + 1);
8359         if (status != NV_OK)
8360             return status;
8361 
8362         block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push);
8363     }
8364 
8365     uvm_push_end(&push);
8366 
8367     // Adding this push to existing block tracker will cause all GPU PTE splits
8368     // to serialize on each other, but it's simpler than maintaining a separate
8369     // tracker and this path isn't performance-critical.
8370     return uvm_tracker_add_push_safe(&existing->tracker, &push);
8371 }
8372 
8373 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new)
8374 {
8375     uvm_gpu_t *gpu;
8376     uvm_gpu_id_t id;
8377     NV_STATUS status;
8378 
8379     for_each_gpu_id(id) {
8380         if (!uvm_va_block_gpu_state_get(existing, id))
8381             continue;
8382 
8383         gpu = block_get_gpu(existing, id);
8384 
8385         if (block_gpu_has_page_tables(existing, gpu)) {
8386             status = block_split_presplit_ptes_gpu(existing, new, gpu);
8387             if (status != NV_OK)
8388                 return status;
8389         }
8390     }
8391 
8392     return NV_OK;
8393 }
8394 
8395 typedef struct
8396 {
8397     // Number of chunks contained by this VA block
8398     size_t num_chunks;
8399 
8400     // Index of the "interesting" chunk, either adjacent to or spanning the
8401     // split point depending on which block this is.
8402     size_t chunk_index;
8403 
8404     // Size of the chunk referenced by chunk_index
8405     uvm_chunk_size_t chunk_size;
8406 } block_gpu_chunk_split_state_t;
8407 
8408 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
8409                                             block_gpu_chunk_split_state_t *state,
8410                                             NvU64 start,
8411                                             NvU64 end,
8412                                             uvm_page_index_t page_index,
8413                                             uvm_gpu_t *gpu)
8414 {
8415     NvU64 size = end - start + 1;
8416     state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu);
8417     state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
8418 }
8419 
8420 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
8421 {
8422     uvm_gpu_t *accessing_gpu;
8423     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8424 
8425     uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
8426 
8427     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8428         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
8429 
8430         uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8431                                                          peer_addr,
8432                                                          uvm_gpu_chunk_get_size(chunk));
8433     }
8434 }
8435 
8436 // Perform any chunk splitting and array growing required for this block split,
8437 // but don't actually move chunk pointers anywhere.
8438 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8439 {
8440     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8441     uvm_gpu_t *accessing_gpu;
8442     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8443     uvm_gpu_chunk_t **temp_chunks;
8444     uvm_gpu_chunk_t *original_chunk, *curr_chunk;
8445     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8446     uvm_chunk_sizes_mask_t split_sizes;
8447     uvm_chunk_size_t subchunk_size;
8448     NV_STATUS status;
8449     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
8450 
8451     block_gpu_chunk_get_split_state(existing,
8452                                     &existing_before_state,
8453                                     existing->start,
8454                                     existing->end,
8455                                     split_page_index,
8456                                     gpu);
8457     block_gpu_chunk_get_split_state(existing,
8458                                     &existing_after_state,
8459                                     existing->start,
8460                                     new->start - 1,
8461                                     split_page_index - 1,
8462                                     gpu);
8463     block_gpu_chunk_get_split_state(new,
8464                                     &new_state,
8465                                     new->start,
8466                                     new->end,
8467                                     0,
8468                                     gpu);
8469 
8470     // Even though we're splitting existing, we could wind up requiring a larger
8471     // chunks array if we split a large chunk into many smaller ones.
8472     if (existing_after_state.num_chunks > existing_before_state.num_chunks) {
8473         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
8474                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
8475         if (!temp_chunks)
8476             return NV_ERR_NO_MEMORY;
8477         existing_gpu_state->chunks = temp_chunks;
8478     }
8479 
8480     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
8481 
8482     // If the chunk covering the split point is not populated, we're done. We've
8483     // already grown the array to cover any new chunks which may be populated
8484     // later.
8485     if (!original_chunk)
8486         return NV_OK;
8487 
8488     // Figure out the splits we need to perform. Remove all sizes >= the current
8489     // size, and all sizes < the target size. Note that the resulting mask will
8490     // be 0 if the sizes match (we're already splitting at a chunk boundary).
8491     UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size);
8492     UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size);
8493     split_sizes = gpu->parent->mmu_user_chunk_sizes;
8494     split_sizes &= existing_before_state.chunk_size - 1;
8495     split_sizes &= ~(new_state.chunk_size - 1);
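    // For example, with supported chunk sizes {4K, 64K, 2M}, splitting a 2M
    // chunk down to a 4K target leaves {64K, 4K} in split_sizes: the loop
    // below first splits the 2M chunk into 64K subchunks, then splits the 64K
    // subchunk covering the split point into 4K subchunks.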
8496 
8497     // Keep splitting the chunk covering the split point until we hit the target
8498     // size.
8499     curr_chunk = original_chunk;
8500     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8501         size_t last_index, num_subchunks;
8502 
8503         status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL);
8504         if (status != NV_OK)
8505             goto error;
8506 
8507         // Split physical GPU mappings for indirect peers
8508         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8509             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
8510 
8511             status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8512                                                                       peer_addr,
8513                                                                       subchunk_size);
8514             if (status != NV_OK)
8515                 goto error;
8516         }
8517 
8518         if (subchunk_size == new_state.chunk_size)
8519             break;
8520 
8521         // Compute the last subchunk index prior to the split point. Divide the
8522         // entire address space into units of subchunk_size, then mod by the
8523         // number of subchunks within the parent.
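        //
        // For example, if a size-aligned 2M parent is being split into 64K
        // subchunks and the split point is 192K into the parent, new->start - 1
        // falls in the third 64K unit, so last_index resolves to 2: the last
        // subchunk still covered by existing.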
8524         last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size);
8525         num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size);
8526         UVM_ASSERT(num_subchunks > 1);
8527         last_index &= num_subchunks - 1;
8528 
8529         uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk);
8530         UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size);
8531     }
8532 
8533     // Note that existing's chunks array still has a pointer to original_chunk,
8534     // not to any newly-split subchunks. If a subsequent split failure occurs on
8535     // a later GPU we'll have to merge it back. Once we're past the preallocate
8536     // stage we'll remove it from the chunks array and move the new split chunks
8537     // in.
8538 
8539     return NV_OK;
8540 
8541 error:
8542     // On error we need to leave the chunk in its initial state
8543     block_merge_chunk(existing, gpu, original_chunk);
8544 
8545     return status;
8546 }
8547 
8548 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block)
8549 {
8550     uvm_cpu_chunk_storage_mixed_t *mixed;
8551     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0);
8552     NV_STATUS status;
8553 
8554     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
8555     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK);
8556 
8557     mixed = uvm_kvmalloc_zero(sizeof(*mixed));
8558     if (!mixed)
8559         return NV_ERR_NO_MEMORY;
8560 
8561     status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots);
8562     if (status != NV_OK) {
8563         uvm_kvfree(mixed);
8564         return status;
8565     }
8566 
8567     bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
8568     block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8569     return status;
8570 }
8571 
8572 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index)
8573 {
8574     uvm_cpu_chunk_storage_mixed_t *mixed;
8575     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8576     uvm_cpu_chunk_t **small_chunks;
8577     size_t slot_index;
8578     NV_STATUS status;
8579 
8580     UVM_ASSERT(chunk);
8581     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8582     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8583 
8584     mixed = uvm_cpu_storage_get_ptr(block);
8585     slot_index = compute_slot_index(block, page_index);
8586     small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8587     if (!small_chunks)
8588         return NV_ERR_NO_MEMORY;
8589 
8590     status = uvm_cpu_chunk_split(chunk, small_chunks);
8591     if (status != NV_OK) {
8592         uvm_kvfree(small_chunks);
8593         return status;
8594     }
8595 
8596     mixed->slots[slot_index] = small_chunks;
8597     clear_bit(slot_index, mixed->big_chunks);
8598     return status;
8599 }
8600 
8601 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8602 {
8603     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8604     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
8605     uvm_chunk_size_t new_size;
8606     uvm_gpu_t *gpu;
8607     NvU64 gpu_mapping_addr;
8608     uvm_processor_mask_t gpu_split_mask;
8609     uvm_gpu_id_t id;
8610     NV_STATUS status;
8611 
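    // Chunks are split one level at a time: a 2M chunk splits into 64K chunks
    // and a 64K chunk splits into 4K chunks. Callers loop over the required
    // sizes (see block_presplit_cpu_chunks) to reach the final target size.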
8612     if (chunk_size == UVM_CHUNK_SIZE_2M)
8613         new_size = UVM_CHUNK_SIZE_64K;
8614     else
8615         new_size = UVM_CHUNK_SIZE_4K;
8616 
8617     UVM_ASSERT(IS_ALIGNED(chunk_size, new_size));
8618 
8619     uvm_processor_mask_zero(&gpu_split_mask);
8620     for_each_gpu_id(id) {
8621         if (!uvm_va_block_gpu_state_get(block, id))
8622             continue;
8623 
8624         gpu = block_get_gpu(block, id);
8625 
8626         // If the parent chunk has not been mapped, there is nothing to split.
8627         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8628         if (gpu_mapping_addr == 0)
8629             continue;
8630 
8631         status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8632                                                             gpu_mapping_addr,
8633                                                             new_size);
8634         if (status != NV_OK)
8635             goto merge;
8636 
8637         uvm_processor_mask_set(&gpu_split_mask, id);
8638     }
8639 
8640     if (new_size == UVM_CHUNK_SIZE_64K)
8641         status = block_split_cpu_chunk_to_64k(block);
8642     else
8643         status = block_split_cpu_chunk_to_4k(block, page_index);
8644 
8645     if (status != NV_OK) {
8646 merge:
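        // Roll back the reverse sysmem mapping splits performed above on the
        // GPUs that had already succeeded.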
8647         for_each_gpu_id_in_mask(id, &gpu_split_mask) {
8648             gpu = block_get_gpu(block, id);
8649             gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8650             uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8651                                                        gpu_mapping_addr,
8652                                                        chunk_size);
8653         }
8654     }
8655 
8656     return status;
8657 }
8658 
8659 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new)
8660 {
8661     uvm_cpu_chunk_storage_mixed_t *existing_mixed;
8662     uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
8663     size_t slot_offset;
8664     size_t existing_slot;
8665     NV_STATUS status = NV_OK;
8666 
8667     UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED);
8668     existing_mixed = uvm_cpu_storage_get_ptr(existing);
8669 
    // Pre-allocate chunk storage for the new block. By definition, the new
    // block will contain only 64K and/or 4K chunks.
8672     //
8673     // We do this here so there are no failures in block_split_cpu().
8674     new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed));
8675     if (!new_mixed)
8676         return NV_ERR_NO_MEMORY;
8677 
8678     slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start));
8679     existing_slot = slot_offset;
8680     for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) {
8681         size_t new_slot = existing_slot - slot_offset;
8682 
8683         if (existing_mixed->slots[existing_slot]) {
8684             uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8685 
8686             if (!small_chunks) {
8687                 status = NV_ERR_NO_MEMORY;
8688                 goto done;
8689             }
8690 
8691             new_mixed->slots[new_slot] = small_chunks;
8692         }
8693     }
8694 
8695     new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8696     UVM_ASSERT(status == NV_OK);
8697 
done:
    if (status != NV_OK) {
        // Free every per-slot array already allocated for the new block,
        // including the one at new slot 0 (existing_slot == slot_offset).
        // Allocation failed at existing_slot itself, so that entry was never
        // populated.
        while (existing_slot-- > slot_offset)
            uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]);

        uvm_kvfree(new_mixed);
    }
8705 
8706     return status;
8707 }
8708 
8709 static void block_free_cpu_chunk_storage(uvm_va_block_t *block)
8710 {
8711     if (block->cpu.chunks) {
8712         uvm_cpu_chunk_storage_mixed_t *mixed;
8713         size_t slot_index;
8714 
8715         UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8716         mixed = uvm_cpu_storage_get_ptr(block);
8717         for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
8718             uvm_kvfree(mixed->slots[slot_index]);
8719 
8720         uvm_kvfree(mixed);
8721         block->cpu.chunks = 0;
8722     }
8723 }
8724 
8725 // Perform any CPU chunk splitting that may be required for this block split.
8726 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block.
8727 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8728 {
8729     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8730     uvm_cpu_chunk_t *splitting_chunk;
8731     uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
8732     uvm_chunk_size_t subchunk_size;
8733     NV_STATUS status = NV_OK;
8734 
8735     UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
8736     splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8737 
8738     // If the page covering the split point has not been populated, there is no
8739     // need to split.
8740     if (!splitting_chunk)
8741         return NV_OK;
8742 
8743     // If the split point is aligned on the chunk size, there is no need to
8744     // split.
8745     if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
8746         return NV_OK;
8747 
8748     // Remove all sizes above the chunk's current size.
8749     split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
8750     // Remove all sizes below the alignment of the new block's start.
8751     split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
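    // For example, with allocation sizes {4K, 64K, 2M}, splitting a 2M chunk
    // at a 64K-aligned address leaves only {64K} in split_sizes (a single
    // split), while an unaligned split point leaves {64K, 4K}: split to 64K,
    // then split the covering 64K chunk down to 4K.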
8752 
8753     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8754         status = block_split_cpu_chunk_one(existing, page_index);
8755         if (status != NV_OK)
8756             return status;
8757     }
8758 
8759     return block_prealloc_cpu_chunk_storage(existing, new);
8760 }
8761 
8762 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index)
8763 {
8764     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8765     size_t slot_index = compute_slot_index(block, page_index);
8766     uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
8767     uvm_cpu_chunk_t *merged_chunk;
8768 
8769     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8770     UVM_ASSERT(small_chunks);
8771     UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
8772 
8773     merged_chunk = uvm_cpu_chunk_merge(small_chunks);
8774     mixed->slots[slot_index] = merged_chunk;
8775     set_bit(slot_index, mixed->big_chunks);
8776     uvm_kvfree(small_chunks);
8777 }
8778 
8779 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index)
8780 {
8781     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8782     uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
8783     uvm_cpu_chunk_t *merged_chunk;
8784 
8785     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8786     UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
8787 
8788     merged_chunk = uvm_cpu_chunk_merge(big_chunks);
8789     block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
8790     uvm_kvfree(mixed);
8791 }
8792 
8793 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8794 {
8795     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8796     uvm_gpu_id_t id;
8797 
8798     if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
8799         block_merge_cpu_chunks_to_64k(block, page_index);
8800     }
8801     else {
8802         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8803         block_merge_cpu_chunks_to_2m(block, page_index);
8804     }
8805 
    // Re-fetch the merged chunk so the GPU reverse-mapping merges below use
    // the new, larger chunk size.
    chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8807 
8808     for_each_gpu_id(id) {
8809         NvU64 gpu_mapping_addr;
8810         uvm_gpu_t *gpu;
8811 
8812         if (!uvm_va_block_gpu_state_get(block, id))
8813             continue;
8814 
8815         gpu = block_get_gpu(block, id);
8816         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8817         if (gpu_mapping_addr == 0)
8818             continue;
8819 
8820         uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8821                                                    gpu_mapping_addr,
8822                                                    uvm_cpu_chunk_get_size(chunk));
8823     }
8824 }
8825 
8826 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8827 {
8828     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8829     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8830     uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
8831     uvm_chunk_size_t largest_size;
8832     uvm_chunk_size_t chunk_size;
8833     uvm_chunk_size_t merge_size;
8834     size_t block_size = uvm_va_block_size(existing);
8835 
8836     if (!chunk || uvm_cpu_chunk_is_physical(chunk))
8837         return;
8838 
8839     chunk_size = uvm_cpu_chunk_get_size(chunk);
8840 
    // Remove all CPU chunk sizes above the size of the existing VA block.
    // Block sizes are not always powers of 2, so use the largest power of 2
    // less than or equal to the block size: we can't merge to a size larger
    // than the block.
8845     largest_size = rounddown_pow_of_two(block_size);
8846     merge_sizes &= (largest_size | (largest_size - 1));
8847 
    // Remove all CPU chunk sizes smaller than or equal to the current size of
    // the chunk, since we can only merge into a larger size.
    merge_sizes &= ~(chunk_size | (chunk_size - 1));
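    // For example, a 4K chunk inside a 2M block can be merged first to 64K and
    // then to 2M, provided each merge target stays within the block and every
    // page under it is allocated; the loop below stops at the first merge that
    // can't be performed.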
8850 
8851     for_each_chunk_size(merge_size, merge_sizes) {
8852         uvm_va_block_region_t chunk_region;
8853 
8854         // The block has to fully contain the VA range after the merge.
8855         if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
8856             !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
8857             break;
8858 
8859         chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
8860 
8861         // If not all pages in the region covered by the chunk are allocated,
8862         // we can't merge.
8863         if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region))
8864             break;
8865 
8866         block_merge_cpu_chunks_one(existing, chunk_region.first);
8867         chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8868         if (uvm_cpu_chunk_is_physical(chunk))
8869             break;
8870     }
8871 
8872     block_free_cpu_chunk_storage(new);
8873 }
8874 
8875 // Pre-allocate everything which doesn't require retry on both existing and new
8876 // which will be needed to handle a split. If this fails, existing must remain
8877 // functionally unmodified.
8878 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new)
8879 {
8880     NV_STATUS status;
8881     uvm_gpu_t *gpu;
8882     uvm_gpu_id_t id;
8883     uvm_page_index_t split_page_index;
8884     uvm_va_block_test_t *block_test;
8885 
8886     status = block_presplit_cpu_chunks(existing, new);
8887     if (status != NV_OK)
8888         goto error;
8889 
8890     for_each_gpu_id(id) {
8891         if (!uvm_va_block_gpu_state_get(existing, id))
8892             continue;
8893 
8894         gpu = block_get_gpu(existing, id);
8895 
8896         status = block_presplit_gpu_chunks(existing, new, gpu);
8897         if (status != NV_OK)
8898             goto error;
8899 
8900         if (!block_gpu_state_get_alloc(new, gpu)) {
8901             status = NV_ERR_NO_MEMORY;
8902             goto error;
8903         }
8904     }
8905 
8906     block_test = uvm_va_block_get_test(existing);
8907     if (block_test && block_test->inject_split_error) {
8908         block_test->inject_split_error = false;
8909         if (!uvm_va_block_is_hmm(existing)) {
8910             UVM_ASSERT(existing->va_range->inject_split_error);
8911             existing->va_range->inject_split_error = false;
8912         }
8913         status = NV_ERR_NO_MEMORY;
8914         goto error;
8915     }
8916 
8917     if (uvm_va_block_is_hmm(existing)) {
8918         uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start);
8919 
8920         if (node && node->node.start != new->start) {
8921             status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL);
8922             if (status != NV_OK)
8923                 goto error;
8924         }
8925     }
8926 
8927     return NV_OK;
8928 
8929 error:
8930     // Merge back the chunks we split
8931     split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8932 
8933     for_each_gpu_id(id) {
8934         uvm_gpu_chunk_t *chunk;
8935         size_t chunk_index;
8936         uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id);
8937 
8938         if (!existing_gpu_state)
8939             continue;
8940 
8941         // If the chunk spanning the split point was split, merge it back
8942         gpu = block_get_gpu(existing, id);
8943         chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL);
8944         chunk = existing_gpu_state->chunks[chunk_index];
8945         if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
8946             continue;
8947 
8948         block_merge_chunk(existing, gpu, chunk);
8949 
        // We could attempt to shrink the chunks array back down, but it doesn't
        // hurt much to have it larger than necessary, and we'd have to handle
        // the shrink call failing anyway on this error path.
    }
8955 
8956     block_merge_cpu_chunks(existing, new);
8957 
8958     return status;
8959 }
8960 
8961 // Re-calculate the block's top-level processor masks:
8962 //   - block->mapped
8963 //   - block->resident
8964 //
8965 // This is called on block split.
8966 static void block_set_processor_masks(uvm_va_block_t *block)
8967 {
8968     size_t num_pages = uvm_va_block_num_cpu_pages(block);
8969     uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages);
8970     uvm_gpu_id_t id;
8971 
8972     if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) {
8973         UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region));
8974         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
8975     }
8976     else {
8977         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
8978     }
8979 
8980     if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) {
8981         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8982 
8983         if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
8984             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU));
8985 
8986         block_clear_resident_processor(block, UVM_ID_CPU);
8987     }
8988     else {
8989         block_set_resident_processor(block, UVM_ID_CPU);
8990     }
8991 
8992     for_each_gpu_id(id) {
8993         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
8994         if (!gpu_state)
8995             continue;
8996 
8997         if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) {
8998             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region));
8999             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region));
9000             uvm_processor_mask_clear(&block->mapped, id);
9001         }
9002         else {
9003             uvm_processor_mask_set(&block->mapped, id);
9004         }
9005 
9006         if (uvm_page_mask_region_empty(&gpu_state->resident, block_region))
9007             block_clear_resident_processor(block, id);
9008         else
9009             block_set_resident_processor(block, id);
9010 
9011         if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region))
9012             uvm_processor_mask_clear(&block->evicted_gpus, id);
9013         else
9014             uvm_processor_mask_set(&block->evicted_gpus, id);
9015     }
9016 }
9017 
9018 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts
9019 // corresponding to a block split.
9020 static void block_split_page_mask(uvm_page_mask_t *existing_mask,
9021                                   size_t existing_pages,
9022                                   uvm_page_mask_t *new_mask,
9023                                   size_t new_pages)
9024 {
9025     UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n",
9026                    existing_pages, new_pages);
9027 
9028     // The new block is always in the upper region of existing, so shift the bit
9029     // vectors down.
9030     //
9031     // Note that bitmap_shift_right requires both dst and src to be the same
9032     // size. That's ok since we don't scale them by block size.
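    //
    // For example, splitting a 512-page block into 384 existing pages and 128
    // new pages copies bits [384..511] of existing_mask into bits [0..127] of
    // new_mask, then clears bits [384..511] in existing_mask.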
9033     uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages);
9034     uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages));
9035 }
9036 
9037 // Split the CPU state within the existing block. existing's start is correct
9038 // but its end has not yet been adjusted.
9039 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
9040 {
9041     size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new);
9042     uvm_pte_bits_cpu_t pte_bit;
9043     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing);
9044     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9045     uvm_page_index_t page_index;
9046     uvm_page_index_t next_page_index;
9047     uvm_cpu_chunk_t *chunk;
9048     uvm_va_range_t *existing_va_range = existing->va_range;
9049 
9050     if (existing_va_range) {
9051         UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9052         UVM_ASSERT(existing->va_range->type == new->va_range->type);
9053     }
9054 
9055     UVM_ASSERT(existing->start < new->start);
9056     UVM_ASSERT(existing->end == new->end);
9057 
9058     UVM_ASSERT(PAGE_ALIGNED(new->start));
9059     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9060 
9061     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9062 
9063     // We don't have to unmap the CPU since its virtual -> physical mappings
9064     // don't change.
9065 
9066     page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1);
9067 
9068     for_each_cpu_chunk_in_block_region_safe(chunk,
9069                                             page_index,
9070                                             next_page_index,
9071                                             existing,
9072                                             uvm_va_block_region(split_page_index, block_region.outer)) {
9073         uvm_page_index_t new_chunk_page_index;
9074         NV_STATUS status;
9075 
9076         uvm_cpu_chunk_remove_from_block(existing, page_index);
9077 
9078         // The chunk has to be adjusted for the new block before inserting it.
9079         new_chunk_page_index = page_index - split_page_index;
9080 
9081         // This should never fail because all necessary storage was allocated
9082         // in block_presplit_cpu_chunks().
9083         status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
9084         UVM_ASSERT(status == NV_OK);
9085     }
9086 
9087     new->cpu.ever_mapped = existing->cpu.ever_mapped;
9088 
9089     block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
9090 
9091     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
9092         block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
9093 }
9094 
9095 // Fill out the blocks' chunks arrays with the chunks split by
9096 // block_presplit_gpu_chunks.
9097 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9098 {
9099     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9100     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id);
9101     uvm_gpu_chunk_t **temp_chunks;
9102     uvm_gpu_chunk_t *original_chunk;
9103     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
9104     size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new;
9105     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9106     size_t i;
9107 
9108     block_gpu_chunk_get_split_state(existing,
9109                                     &existing_before_state,
9110                                     existing->start,
9111                                     existing->end,
9112                                     split_page_index,
9113                                     gpu);
9114     block_gpu_chunk_get_split_state(existing,
9115                                     &existing_after_state,
9116                                     existing->start,
9117                                     new->start - 1,
9118                                     split_page_index - 1,
9119                                     gpu);
9120     block_gpu_chunk_get_split_state(new,
9121                                     &new_state,
9122                                     new->start,
9123                                     new->end,
9124                                     0,
9125                                     gpu);
9126 
9127     // General case (B is original_chunk):
9128     //                                          split
9129     //                                            v
9130     //  existing (before) [------ A -----][------ B -----][------ C -----]
9131     //  existing (after)  [------ A -----][- B0 -]
9132     //  new                                       [- B1 -][------ C -----]
9133     //
9134     // Note that the logic below also handles the case of the split happening at
9135     // a chunk boundary. That case behaves as though there is no B0 chunk.
9136 
9137     // Number of chunks to the left and right of original_chunk (A and C above).
9138     // Either or both of these may be 0.
9139     num_pre_chunks  = existing_before_state.chunk_index;
9140     num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1;
9141 
9142     // Number of subchunks under existing's portion of original_chunk (B0 above)
9143     num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks;
9144 
9145     // Number of subchunks under new's portion of original_chunk (B1 above)
9146     num_split_chunks_new = new_state.num_chunks - num_post_chunks;
9147 
9148     UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0);
9149     UVM_ASSERT(num_split_chunks_new > 0);
9150 
9151     // Copy post chunks from the end of existing into new (C above)
9152     memcpy(&new_gpu_state->chunks[num_split_chunks_new],
9153            &existing_gpu_state->chunks[existing_before_state.chunk_index + 1],
9154            num_post_chunks * sizeof(new_gpu_state->chunks[0]));
9155 
9156     // Save off the original split chunk since we may overwrite the array
9157     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
9158 
9159     // Fill out the new pointers
9160     if (original_chunk) {
9161         // Note that if the split happened at a chunk boundary, original_chunk
9162         // will not be split. In that case, num_split_chunks_existing will be 0
9163         // and num_split_chunks_new will be 1, so the left copy will be skipped
9164         // and the right copy will pick up the chunk.
9165 
9166         // Copy left newly-split chunks into existing (B0 above). The array was
9167         // re-sized in block_presplit_gpu_chunks as necessary.
9168         size_t num_subchunks;
9169 
9170         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9171                                                   original_chunk,
9172                                                   0, // start_index
9173                                                   num_split_chunks_existing,
9174                                                   &existing_gpu_state->chunks[existing_before_state.chunk_index]);
9175         UVM_ASSERT(num_subchunks == num_split_chunks_existing);
9176 
9177         // Copy right newly-split chunks into new (B1 above), overwriting the
9178         // pointer to the original chunk.
9179         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9180                                                   original_chunk,
9181                                                   num_split_chunks_existing, // start_index
9182                                                   num_split_chunks_new,
9183                                                   &new_gpu_state->chunks[0]);
9184         UVM_ASSERT(num_subchunks == num_split_chunks_new);
9185     }
9186     else {
9187         // If the chunk wasn't already populated we don't need to copy pointers
9188         // anywhere, but we need to clear out stale pointers from existing's
9189         // array covering the new elements. new's chunks array was already zero-
9190         // initialized.
9191         memset(&existing_gpu_state->chunks[existing_before_state.chunk_index],
9192                0,
9193                num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0]));
9194     }
9195 
9196     // Since we update the reverse map information, protect it against a
9197     // concurrent lookup
9198     uvm_spin_lock(&gpu->pmm.list_lock);
9199 
9200     // Update the reverse map of all the chunks that are now under the new block
9201     for (i = 0; i < new_state.num_chunks; ++i) {
9202         if (new_gpu_state->chunks[i]) {
9203             UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing);
9204             new_gpu_state->chunks[i]->va_block = new;
9205 
9206             // Adjust the page_index within the VA block for the new subchunks in
9207             // the new VA block
9208             UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index);
9209             new_gpu_state->chunks[i]->va_block_page_index -= split_page_index;
9210         }
9211     }
9212 
9213     uvm_spin_unlock(&gpu->pmm.list_lock);
9214 
9215     // Attempt to shrink existing's chunk allocation. If the realloc fails, just
9216     // keep on using the old larger one.
9217     if (existing_after_state.num_chunks < existing_before_state.num_chunks) {
9218         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
9219                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
9220         if (temp_chunks)
9221             existing_gpu_state->chunks = temp_chunks;
9222     }
9223 }
9224 
9225 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id)
9226 {
9227     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id);
9228     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id);
9229     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9230     uvm_gpu_va_space_t *gpu_va_space;
9231     uvm_gpu_t *gpu;
9232     uvm_gpu_t *accessing_gpu;
9233     size_t new_pages = uvm_va_block_num_cpu_pages(new);
9234     size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
9235     uvm_pte_bits_gpu_t pte_bit;
9236     size_t num_chunks, i;
9237     uvm_cpu_chunk_t *cpu_chunk;
9238     uvm_page_index_t page_index;
9239 
9240     if (!existing_gpu_state)
9241         return;
9242 
9243     gpu = uvm_va_space_get_gpu(va_space, gpu_id);
9244     UVM_ASSERT(new_gpu_state);
9245 
9246     new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes;
9247 
9248     UVM_ASSERT(PAGE_ALIGNED(new->start));
9249     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9250     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9251 
9252     for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) {
9253         uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
9254                                                      uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent),
9255                                                      new);
9256     }
9257 
9258     block_copy_split_gpu_chunks(existing, new, gpu);
9259 
9260     num_chunks = block_num_gpu_chunks(new, gpu);
9261 
9262     // Reparent GPU mappings for indirect peers
9263     for (i = 0; i < num_chunks; ++i) {
9264         uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
9265         if (!chunk)
9266             continue;
9267 
9268         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9269             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
9270 
9271             uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
9272                                                                peer_addr,
9273                                                                new);
9274         }
9275     }
9276 
9277     block_split_page_mask(&existing_gpu_state->resident,
9278                           existing_pages,
9279                           &new_gpu_state->resident,
9280                           new_pages);
9281 
9282     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
9283         block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages,
9284                               &new_gpu_state->pte_bits[pte_bit], new_pages);
9285     }
9286 
9287     // Adjust page table ranges.
9288     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
9289     if (gpu_va_space) {
9290         if (existing_gpu_state->page_table_range_big.table) {
9291             NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
9292 
9293             // existing's end has not been adjusted yet
9294             existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
9295 
9296             // Take references on all big pages covered by new
9297             new_pages_big = uvm_va_block_num_big_pages(new, big_page_size);
9298             if (new_pages_big) {
9299                 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9300                                                &existing_gpu_state->page_table_range_big,
9301                                                &new_gpu_state->page_table_range_big,
9302                                                new_pages_big);
9303 
9304                 // If the split point is within a big page region, we might have
9305                 // a gap since neither existing nor new can use it anymore.
9306                 // Get the top N bits from existing's mask to handle that.
9307                 bitmap_shift_right(new_gpu_state->big_ptes,
9308                                    existing_gpu_state->big_ptes,
9309                                    uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big,
9310                                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9311 
9312                 new_gpu_state->initialized_big = existing_gpu_state->initialized_big;
9313             }
9314 
9315             // Drop existing's references on the big PTEs it no longer covers
9316             // now that new has references on them. Note that neither existing
9317             // nor new might have big PTEs after the split. In that case, this
9318             // shrink will free the entire old range.
9319             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9320                                         &existing_gpu_state->page_table_range_big,
9321                                         existing_pages_big);
9322 
9323             if (existing_pages_big == 0) {
9324                 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big));
9325                 existing_gpu_state->initialized_big = false;
9326             }
9327 
9328             bitmap_clear(existing_gpu_state->big_ptes,
9329                          existing_pages_big,
9330                          MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big);
9331         }
9332 
9333         if (existing_gpu_state->page_table_range_4k.table) {
9334             // Since existing and new share the same PDE we just need to bump
9335             // the ref-count on new's sub-range.
9336             uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9337                                            &existing_gpu_state->page_table_range_4k,
9338                                            &new_gpu_state->page_table_range_4k,
9339                                            uvm_va_block_size(new) / UVM_PAGE_SIZE_4K);
9340 
9341             // Drop existing's references on the PTEs it no longer covers now
9342             // that new has references on them.
9343             existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K);
9344             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9345                                         &existing_gpu_state->page_table_range_4k,
9346                                         existing_pages_4k);
9347         }
9348 
9349         // We have to set this explicitly to handle the case of splitting an
9350         // invalid, active 2M PTE with no lower page tables allocated.
9351         if (existing_gpu_state->pte_is_2m) {
9352             UVM_ASSERT(!existing_gpu_state->page_table_range_big.table);
9353             UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table);
9354             existing_gpu_state->pte_is_2m = false;
9355         }
9356 
9357         // existing can't possibly cover 2MB after a split, so drop any 2M PTE
9358         // references it has. We've taken the necessary references on the lower
9359         // tables above.
9360         block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m);
9361         existing_gpu_state->activated_big = false;
9362         existing_gpu_state->activated_4k = false;
9363     }
9364 
9365     block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages);
9366 }
9367 
9368 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
9369                              NvU64 new_end,
9370                              uvm_va_block_t **new_va_block,
9371                              uvm_va_range_t *new_va_range)
9372 {
9373     uvm_va_space_t *va_space;
9374     uvm_va_block_t *new_block = NULL;
9375     NV_STATUS status;
9376 
9377     va_space = new_va_range->va_space;
9378     UVM_ASSERT(existing_va_block->va_range);
9379     UVM_ASSERT(existing_va_block->va_range->va_space == va_space);
9380     UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block));
9381 
9382     // External range types can't be split
9383     UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9384     UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9385     uvm_assert_rwsem_locked_write(&va_space->lock);
9386 
9387     UVM_ASSERT(new_end > existing_va_block->start);
9388     UVM_ASSERT(new_end < existing_va_block->end);
9389     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
9390 
9391     status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block);
9392     if (status != NV_OK)
9393         return status;
9394 
9395     // We're protected from other splits and faults by the va_space lock being
9396     // held in write mode, but that doesn't stop the reverse mapping (eviction
9397     // path) from inspecting the existing block. Stop those threads by taking
9398     // the block lock. When a reverse mapping thread takes this lock after the
9399     // split has been performed, it will have to re-inspect state and may see
9400     // that it should use the newly-split block instead.
9401     uvm_mutex_lock(&existing_va_block->lock);
9402 
9403     status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range);
9404 
9405     uvm_mutex_unlock(&existing_va_block->lock);
9406 
9407     if (status != NV_OK)
9408         uvm_va_block_release(new_block);
9409     else if (new_va_block)
9410         *new_va_block = new_block;
9411 
9412     return status;
9413 }
9414 
9415 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
9416                                     NvU64 new_end,
9417                                     uvm_va_block_t *new_block,
9418                                     uvm_va_range_t *new_va_range)
9419 {
9420     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block);
9421     uvm_gpu_id_t id;
9422     NV_STATUS status;
9423     uvm_perf_event_data_t event_data;
9424 
9425     UVM_ASSERT(block_check_chunks(existing_va_block));
9426 
9427     // As soon as we update existing's reverse mappings to point to the newly-
9428     // split block, the eviction path could try to operate on the new block.
9429     // Lock that out too until new is ready.
9430     //
9431     // Note that we usually shouldn't nest block locks, but it's ok here because
9432     // we just created new_block so no other thread could possibly take it out
9433     // of order with existing's lock.
9434     uvm_mutex_lock_no_tracking(&new_block->lock);
9435 
9436     // The split has to be transactional, meaning that if we fail, the existing
9437     // block must not be modified. Handle that by pre-allocating everything we
9438     // might need under both existing and new at the start so we only have a
9439     // single point of failure.
9440 
    // Since pre-allocation might require allocating new PTEs, we have to
    // handle allocation retry, which can drop existing's block lock. The
    // pre-allocation is therefore split into two steps: the first step, which
    // allocates and splits PTEs, can tolerate the block lock being dropped and
    // re-taken. It won't modify existing_va_block other than adding new PTE
    // allocations and splitting existing PTEs, which is always safe.
9447     status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block,
9448                                        NULL,
9449                                        block_split_presplit_ptes(existing_va_block, new_block));
9450     if (status != NV_OK)
9451         goto out;
9452 
9453     // Pre-allocate, stage two. This modifies existing_va_block in ways which
9454     // violate many assumptions (such as changing chunk size), but it will put
9455     // things back into place on a failure without dropping the block lock.
9456     status = block_split_preallocate_no_retry(existing_va_block, new_block);
9457     if (status != NV_OK)
9458         goto out;
9459 
9460     // We'll potentially be freeing page tables, so we need to wait for any
9461     // outstanding work before we start
9462     status = uvm_tracker_wait(&existing_va_block->tracker);
9463     if (status != NV_OK)
9464         goto out;
9465 
9466     // Update existing's state only once we're past all failure points
9467 
9468     event_data.block_shrink.block = existing_va_block;
9469     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data);
9470 
9471     block_split_cpu(existing_va_block, new_block);
9472 
9473     for_each_gpu_id(id)
9474         block_split_gpu(existing_va_block, new_block, id);
9475 
    // Update the size of the existing block first so that
    // block_set_processor_masks can use block_{set,clear}_resident_processor,
    // which rely on the size being correct.
9479     existing_va_block->end = new_end;
9480 
9481     block_split_page_mask(&existing_va_block->read_duplicated_pages,
9482                           uvm_va_block_num_cpu_pages(existing_va_block),
9483                           &new_block->read_duplicated_pages,
9484                           uvm_va_block_num_cpu_pages(new_block));
9485 
9486     block_split_page_mask(&existing_va_block->maybe_mapped_pages,
9487                           uvm_va_block_num_cpu_pages(existing_va_block),
9488                           &new_block->maybe_mapped_pages,
9489                           uvm_va_block_num_cpu_pages(new_block));
9490 
9491     block_set_processor_masks(existing_va_block);
9492     block_set_processor_masks(new_block);
9493 
9494     if (uvm_va_block_is_hmm(existing_va_block)) {
9495         uvm_hmm_va_block_split_tree(existing_va_block, new_block);
9496         uvm_va_policy_node_split_move(existing_va_block, new_block);
9497     }
9498 
9499 out:
9500     // Run checks on existing_va_block even on failure, since an error must
9501     // leave the block in a consistent state.
9502     UVM_ASSERT(block_check_chunks(existing_va_block));
9503     UVM_ASSERT(block_check_mappings(existing_va_block));
9504     if (status == NV_OK) {
9505         UVM_ASSERT(block_check_chunks(new_block));
9506         UVM_ASSERT(block_check_mappings(new_block));
9507     }
9508     else {
9509         block_free_cpu_chunk_storage(new_block);
9510     }
9511 
9512     uvm_mutex_unlock_no_tracking(&new_block->lock);
9513 
9514     return status;
9515 }
9516 
9517 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block,
9518                                               uvm_va_block_region_t region)
9519 {
9520     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9521     uvm_va_range_t *va_range = va_block->va_range;
9522 
9523     if (!uvm_va_space_can_read_duplicate(va_space, NULL))
9524         return false;
9525 
9526     // TODO: Bug 3660922: need to implement HMM read duplication support.
9527     if (uvm_va_block_is_hmm(va_block) ||
9528         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
9529         return false;
9530 
9531     if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET
9532         && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0)
9533         return false;
9534 
9535     return true;
9536 }
9537 
9538 // Returns the new access permission for the processor that faulted or
9539 // triggered access counter notifications on the given page
9540 //
9541 // TODO: Bug 1766424: this function works on a single page at a time. This
9542 //       could be changed in the future to optimize multiple faults/counters on
9543 //       contiguous pages.
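//
// Summary of the policy implemented below: the new permission starts as the
// protection implied by the access type (which is asserted not to exceed the
// page's logical protection). A READ_ONLY request may be upgraded to
// READ_WRITE when the page is not a read-duplication candidate and the
// upgrade wouldn't require revoking atomic mappings from other faultable
// processors, and a READ_WRITE request may be further upgraded to
// READ_WRITE_ATOMIC when the faulting processor has native atomics to the new
// residency.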
9544 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block,
9545                                          struct vm_area_struct *hmm_vma,
9546                                          uvm_page_index_t page_index,
9547                                          uvm_processor_id_t fault_processor_id,
9548                                          uvm_processor_id_t new_residency,
9549                                          uvm_fault_access_type_t access_type)
9550 {
9551     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9552     uvm_prot_t logical_prot, new_prot;
9553 
9554     // TODO: Bug 1766432: Refactor into policies. Current policy is
9555     //       query_promote: upgrade access privileges to avoid future faults IF
9556     //       they don't trigger further revocations.
9557     new_prot = uvm_fault_access_type_to_prot(access_type);
9558     logical_prot = compute_logical_prot(va_block, hmm_vma, page_index);
9559 
9560     UVM_ASSERT(logical_prot >= new_prot);
9561 
9562     if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY &&
9563         !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) {
9564         uvm_processor_mask_t processors_with_atomic_mapping;
9565         uvm_processor_mask_t revoke_processors;
9566 
9567         block_page_authorized_processors(va_block,
9568                                          page_index,
9569                                          UVM_PROT_READ_WRITE_ATOMIC,
9570                                          &processors_with_atomic_mapping);
9571 
9572         uvm_processor_mask_andnot(&revoke_processors,
9573                                   &processors_with_atomic_mapping,
9574                                   &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9575 
        // Only faultable processors matter here, so restrict the revoke mask
        // to them before checking whether it's empty.
9578         uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors);
9579 
9580         if (uvm_processor_mask_empty(&revoke_processors))
9581             new_prot = UVM_PROT_READ_WRITE;
9582     }
9583     if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) {
9584         if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id))
9585             new_prot = UVM_PROT_READ_WRITE_ATOMIC;
9586     }
9587 
9588     return new_prot;
9589 }
9590 
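// Helper for uvm_va_block_add_mappings_after_migration(): map each processor
// in map_processors to the given region/page mask on the new residency. When
// max_prot is READ_WRITE_ATOMIC and the requesting processor has native
// atomics to the residency, processors that also have native atomics are
// mapped with READ_WRITE_ATOMIC first; the remaining processors are mapped
// with a (possibly downgraded) protection, with writes promoted back to
// atomics for processors that have system-wide atomics disabled.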
9591 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9592                                                        uvm_va_block_context_t *va_block_context,
9593                                                        uvm_processor_id_t new_residency,
9594                                                        uvm_processor_id_t processor_id,
9595                                                        const uvm_processor_mask_t *map_processors,
9596                                                        uvm_va_block_region_t region,
9597                                                        const uvm_page_mask_t *map_page_mask,
9598                                                        uvm_prot_t max_prot,
9599                                                        const uvm_processor_mask_t *thrashing_processors,
9600                                                        uvm_tracker_t *tracker)
9601 {
9602     NV_STATUS status;
9603     uvm_processor_id_t map_processor_id;
9604     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9605     uvm_prot_t new_map_prot = max_prot;
9606     uvm_processor_mask_t map_processors_local;
9607 
9608     uvm_processor_mask_copy(&map_processors_local, map_processors);
9609 
9610     // Handle atomic mappings separately
9611     if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) {
9612         bool this_processor_has_native_atomics;
9613 
9614         this_processor_has_native_atomics =
9615             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id);
9616 
9617         if (this_processor_has_native_atomics) {
9618             uvm_processor_mask_t map_atomic_processors;
9619 
9620             // Compute processors with native atomics to the residency
9621             uvm_processor_mask_and(&map_atomic_processors,
9622                                    &map_processors_local,
9623                                    &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9624 
9625             // Filter out these mapped processors for the next steps
9626             uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors);
9627 
9628             for_each_id_in_mask(map_processor_id, &map_atomic_processors) {
9629                 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9630                 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9631                     cause = UvmEventMapRemoteCauseThrashing;
9632 
9633                 status = uvm_va_block_map(va_block,
9634                                           va_block_context,
9635                                           map_processor_id,
9636                                           region,
9637                                           map_page_mask,
9638                                           UVM_PROT_READ_WRITE_ATOMIC,
9639                                           cause,
9640                                           tracker);
9641                 if (status != NV_OK)
9642                     return status;
9643             }
9644 
9645             new_map_prot = UVM_PROT_READ_WRITE;
9646         }
9647         else {
9648             if (UVM_ID_IS_CPU(processor_id))
9649                 new_map_prot = UVM_PROT_READ_WRITE;
9650             else
9651                 new_map_prot = UVM_PROT_READ_ONLY;
9652         }
9653     }
9654 
    // Map the rest of the processors
9656     for_each_id_in_mask(map_processor_id, &map_processors_local) {
9657         UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9658         uvm_prot_t final_map_prot;
9659         bool map_processor_has_enabled_system_wide_atomics =
9660             uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id);
9661 
9662         // Write mappings from processors with disabled system-wide atomics are treated like atomics
9663         if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics)
9664             final_map_prot = UVM_PROT_READ_WRITE_ATOMIC;
9665         else
9666             final_map_prot = new_map_prot;
9667 
9668         if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9669             cause = UvmEventMapRemoteCauseThrashing;
9670 
9671         status = uvm_va_block_map(va_block,
9672                                   va_block_context,
9673                                   map_processor_id,
9674                                   region,
9675                                   map_page_mask,
9676                                   final_map_prot,
9677                                   cause,
9678                                   tracker);
9679         if (status != NV_OK)
9680             return status;
9681     }
9682 
9683     return NV_OK;
9684 }
9685 
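// Add mappings to the new residency for the accessed_by processors (plus any
// thrashing processors) that can access it. The caller's processor is
// excluded, as is the preferred location when the new residency is elsewhere
// and the preferred location has its own memory. UVM-Lite GPUs are mapped
// only when the new residency is the preferred location, and the CPU is
// mapped only on migratable pages. Pages read-duplicated by performance
// heuristics are skipped, since read duplication takes precedence over
// SetAccessedBy.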
9686 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9687                                                     uvm_va_block_context_t *va_block_context,
9688                                                     uvm_processor_id_t new_residency,
9689                                                     uvm_processor_id_t processor_id,
9690                                                     uvm_va_block_region_t region,
9691                                                     const uvm_page_mask_t *map_page_mask,
9692                                                     uvm_prot_t max_prot,
9693                                                     const uvm_processor_mask_t *thrashing_processors)
9694 {
9695     NV_STATUS tracker_status, status = NV_OK;
9696     uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus;
9697     uvm_processor_id_t map_processor_id;
9698     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9699     const uvm_page_mask_t *final_page_mask = map_page_mask;
9700     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9701     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
9702     uvm_processor_id_t preferred_location;
9703 
9704     uvm_assert_mutex_locked(&va_block->lock);
9705 
9706     // Read duplication takes precedence over SetAccessedBy.
9707     //
9708     // Exclude ranges with read duplication set...
9709     if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
9710         status = NV_OK;
9711         goto out;
9712     }
9713 
9714     // ... and pages read-duplicated by performance heuristics
9715     if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) {
9716         if (map_page_mask) {
9717             uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask,
9718                                  map_page_mask,
9719                                  &va_block->read_duplicated_pages);
9720         }
9721         else {
9722             uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages);
9723         }
9724         final_page_mask = &va_block_context->mapping.filtered_page_mask;
9725     }
9726 
9727     // Add mappings for accessed_by processors and the given processor mask
9728     if (thrashing_processors)
9729         uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors);
9730     else
9731         uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by);
9732 
    // Only processors that can access the new location need to be considered
9734     uvm_processor_mask_and(&map_other_processors,
9735                            &map_other_processors,
9736                            &va_space->accessible_from[uvm_id_value(new_residency)]);
9737 
9738     // Exclude caller processor as it must have already been mapped
9739     uvm_processor_mask_clear(&map_other_processors, processor_id);
9740 
9741     // Exclude preferred location so it won't get remote mappings
9742     preferred_location = policy->preferred_location;
9743     if (UVM_ID_IS_VALID(preferred_location) &&
9744         !uvm_id_equal(new_residency, preferred_location) &&
9745         uvm_va_space_processor_has_memory(va_space, preferred_location)) {
9746         uvm_processor_mask_clear(&map_other_processors, preferred_location);
9747     }
9748 
9749     // Map the UVM-Lite GPUs if the new location is the preferred location. This
9750     // will only create mappings on first touch. After that they're persistent
9751     // so uvm_va_block_map will be a no-op.
9752     uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9753     if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) &&
9754         uvm_id_equal(new_residency, preferred_location)) {
9755         for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) {
9756             status = uvm_va_block_map(va_block,
9757                                       va_block_context,
9758                                       map_processor_id,
9759                                       region,
9760                                       final_page_mask,
9761                                       UVM_PROT_READ_WRITE_ATOMIC,
9762                                       UvmEventMapRemoteCauseCoherence,
9763                                       &local_tracker);
9764             if (status != NV_OK)
9765                 goto out;
9766         }
9767     }
9768 
9769     uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9770 
9771     // We can't map non-migratable pages to the CPU. If we have any, build a
9772     // new mask of migratable pages and map the CPU separately.
9773     if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) &&
9774         !uvm_range_group_all_migratable(va_space,
9775                                         uvm_va_block_region_start(va_block, region),
9776                                         uvm_va_block_region_end(va_block, region))) {
9777         uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask;
9778 
9779         uvm_range_group_migratable_page_mask(va_block, region, migratable_mask);
9780         if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) {
9781             uvm_processor_mask_t cpu_mask;
9782             uvm_processor_mask_zero(&cpu_mask);
9783             uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU);
9784 
9785             status = do_block_add_mappings_after_migration(va_block,
9786                                                            va_block_context,
9787                                                            new_residency,
9788                                                            processor_id,
9789                                                            &cpu_mask,
9790                                                            region,
9791                                                            migratable_mask,
9792                                                            max_prot,
9793                                                            thrashing_processors,
9794                                                            &local_tracker);
9795             if (status != NV_OK)
9796                 goto out;
9797         }
9798 
9799         uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU);
9800     }
9801 
9802     status = do_block_add_mappings_after_migration(va_block,
9803                                                    va_block_context,
9804                                                    new_residency,
9805                                                    processor_id,
9806                                                    &map_other_processors,
9807                                                    region,
9808                                                    final_page_mask,
9809                                                    max_prot,
9810                                                    thrashing_processors,
9811                                                    &local_tracker);
9812     if (status != NV_OK)
9813         goto out;
9814 
9815 out:
9816     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9817     uvm_tracker_deinit(&local_tracker);
9818     return status == NV_OK ? tracker_status : status;
9819 }
9820 
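// Compute the highest permission with which processor_id may map page_index:
// UVM-Lite GPUs always get READ_WRITE_ATOMIC; pages resident nowhere get
// UVM_PROT_NONE; pages resident in multiple places can be mapped READ_ONLY at
// most, and only by processors holding one of the copies; pages with a single
// resident copy get READ_WRITE_ATOMIC, READ_WRITE or READ_ONLY depending on
// accessibility, existing write/atomic mappings on other processors, native
// atomics support and whether system-wide atomics are enabled.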
9821 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
9822                                                         uvm_processor_id_t processor_id,
9823                                                         uvm_page_index_t page_index)
9824 {
9825     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9826     uvm_processor_mask_t resident_processors;
9827     NvU32 resident_processors_count;
9828 
9829     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id))
9830         return UVM_PROT_READ_WRITE_ATOMIC;
9831 
9832     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
9833     resident_processors_count = uvm_processor_mask_get_count(&resident_processors);
9834 
9835     if (resident_processors_count == 0) {
9836         return UVM_PROT_NONE;
9837     }
9838     else if (resident_processors_count > 1) {
        // If there are multiple copies, we can only map READ ONLY
9840         //
9841         // The block state doesn't track the mapping target (aperture) of each
9842         // individual PTE, just the permissions and where the data is resident.
9843         // If the data is resident in multiple places, then we have a problem
9844         // since we can't know where the PTE points. This means we won't know
9845         // what needs to be unmapped for cases like UvmUnregisterGpu and
9846         // UvmDisablePeerAccess.
9847         //
9848         // The simple way to solve this is to enforce that a read-duplication
9849         // mapping always points to local memory.
9850         if (uvm_processor_mask_test(&resident_processors, processor_id))
9851             return UVM_PROT_READ_ONLY;
9852 
9853         return UVM_PROT_NONE;
9854     }
9855     else {
9856         uvm_processor_id_t atomic_id;
9857         uvm_processor_id_t residency;
9858         uvm_processor_mask_t atomic_mappings;
9859         uvm_processor_mask_t write_mappings;
9860 
        // Find the id of the processor with the only resident copy
9862         residency = uvm_processor_mask_find_first_id(&resident_processors);
9863         UVM_ASSERT(UVM_ID_IS_VALID(residency));
9864 
9865         // If we cannot map the processor with the resident copy, exit
9866         if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id))
9867             return UVM_PROT_NONE;
9868 
9869         // Fast path: if the page is not mapped anywhere else, it can be safely
9870         // mapped with RWA permission
9871         if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index))
9872             return UVM_PROT_READ_WRITE_ATOMIC;
9873 
9874         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
9875 
9876         // Exclude processors with system-wide atomics disabled from atomic_mappings
9877         uvm_processor_mask_and(&atomic_mappings,
9878                                &atomic_mappings,
9879                                &va_space->system_wide_atomics_enabled_processors);
9880 
9881         // Exclude the processor for which the mapping protections are being computed
9882         uvm_processor_mask_clear(&atomic_mappings, processor_id);
9883 
        // If any processor has an atomic mapping, check whether it has native
        // atomics to the processor with the resident copy. If it does not, we
        // can only map READ ONLY.
9886         atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings);
9887         if (UVM_ID_IS_VALID(atomic_id) &&
9888             !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) {
9889             return UVM_PROT_READ_ONLY;
9890         }
9891 
9892         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
9893 
9894         // Exclude the processor for which the mapping protections are being computed
9895         uvm_processor_mask_clear(&write_mappings, processor_id);
9896 
9897         // At this point, any processor with atomic mappings either has native
9898         // atomics support to the processor with the resident copy or has
9899         // disabled system-wide atomics. If the requesting processor has
9900         // disabled system-wide atomics or has native atomics to that processor,
9901         // we can map with ATOMIC privileges. Likewise, if there are no other
9902         // processors with WRITE or ATOMIC mappings, we can map with ATOMIC
9903         // privileges. For HMM, don't allow GPU atomic access to remote mapped
9904         // system memory even if there are no write mappings since CPU access
9905         // can be upgraded without notification.
9906         if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) ||
9907             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) ||
9908             (uvm_processor_mask_empty(&write_mappings) && !uvm_va_block_is_hmm(va_block))) {
9909             return UVM_PROT_READ_WRITE_ATOMIC;
9910         }
9911 
9912         return UVM_PROT_READ_WRITE;
9913     }
9914 }
9915 
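// Map processor_id over the given region and page mask using, for each page,
// the highest permission computed by
// uvm_va_block_page_compute_highest_permission(). Pages read-duplicated by
// performance heuristics are skipped, as are non-migratable pages when
// mapping the CPU. Pages are batched per protection level so that
// uvm_va_block_map() is called at most once per protection.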
9916 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
9917                                     uvm_va_block_context_t *va_block_context,
9918                                     uvm_processor_id_t processor_id,
9919                                     uvm_va_block_region_t region,
9920                                     const uvm_page_mask_t *page_mask,
9921                                     UvmEventMapRemoteCause cause)
9922 {
9923     uvm_va_range_t *va_range = va_block->va_range;
9924     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9925     NV_STATUS status = NV_OK;
9926     uvm_page_index_t page_index;
9927     uvm_range_group_range_iter_t iter;
9928     uvm_prot_t prot_to_map;
9929 
9930     if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9931         if (!uvm_va_range_vma_check(va_range, va_block_context->mm))
9932             return NV_OK;
9933 
9934         uvm_range_group_range_migratability_iter_first(va_space,
9935                                                        uvm_va_block_region_start(va_block, region),
9936                                                        uvm_va_block_region_end(va_block, region),
9937                                                        &iter);
9938     }
9939 
9940     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map)
9941         va_block_context->mask_by_prot[prot_to_map - 1].count = 0;
9942 
9943     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
9944         // Read duplication takes precedence over SetAccessedBy. Exclude pages
9945         // read-duplicated by performance heuristics
9946         if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))
9947             continue;
9948 
9949         prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index);
9950         if (prot_to_map == UVM_PROT_NONE)
9951             continue;
9952 
9953         if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9954             while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) {
9955                 uvm_range_group_range_migratability_iter_next(va_space,
9956                                                               &iter,
9957                                                               uvm_va_block_region_end(va_block, region));
9958             }
9959 
9960             if (!iter.migratable)
9961                 continue;
9962         }
9963 
9964         if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0)
9965             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask);
9966 
9967         uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index);
9968     }
9969 
9970     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) {
9971         if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0)
9972             continue;
9973 
9974         status = uvm_va_block_map(va_block,
9975                                   va_block_context,
9976                                   processor_id,
9977                                   region,
9978                                   &va_block_context->mask_by_prot[prot_to_map - 1].page_mask,
9979                                   prot_to_map,
9980                                   cause,
9981                                   &va_block->tracker);
9982         if (status != NV_OK)
9983             break;
9984     }
9985 
9986     return status;
9987 }
9988 
9989 static bool can_read_duplicate(uvm_va_block_t *va_block,
9990                                uvm_page_index_t page_index,
9991                                const uvm_va_policy_t *policy,
9992                                const uvm_perf_thrashing_hint_t *thrashing_hint)
9993 {
9994     if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block)))
9995         return true;
9996 
9997     if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
9998         uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) &&
9999         thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN)
10000         return true;
10001 
10002     return false;
10003 }
10004 
10005 // TODO: Bug 1827400: If the faulting processor has support for native
10006 //       atomics to the current location and the faults on the page were
10007 //       triggered by atomic accesses only, we keep the current residency.
10008 //       This is a short-term solution to exercise remote atomics over
10009 //       NVLINK when possible (not only when preferred location is set to
10010 //       the remote GPU) as they are much faster than relying on page
10011 //       faults and permission downgrades, which cause thrashing. In the
10012 //       future, the thrashing detection/prevention heuristics should
10013 //       detect and handle this case.
10014 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space,
10015                                        NvU32 access_type_mask,
10016                                        uvm_processor_id_t processor_id,
10017                                        uvm_processor_id_t residency)
10018 {
10019     // This policy can be enabled/disabled using a module parameter
10020     if (!uvm_perf_map_remote_on_native_atomics_fault)
10021         return false;
10022 
10023     // Only consider atomics faults
10024     if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)
10025         return false;
10026 
    // We cannot differentiate CPU writes from atomics. We exclude CPU faults
    // from the logic explained above in order to avoid mapping the CPU to
    // vidmem due to a plain write.
10030     if (UVM_ID_IS_CPU(processor_id))
10031         return false;
10032 
    // On P9 systems (which have native HW support for system-wide atomics), we
    // have determined experimentally that placing memory on a GPU yields the
    // best performance in most cases (since the CPU can cache vidmem but not
    // vice versa). Therefore, don't map remotely if the current residency is
    // sysmem.
10038     if (UVM_ID_IS_CPU(residency))
10039         return false;
10040 
10041     return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id);
10042 }
10043 
10044 // TODO: Bug 1766424: this function works on a single page at a time. This
10045 //       could be changed in the future to optimize multiple faults or access
10046 //       counter notifications on contiguous pages.
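//
// The new residency is chosen by walking a prioritized list of checks, in the
// order they appear below: forced sysmem, read duplication, the preferred
// location or HMM CPU faults, thrashing pin hints, pages not resident
// anywhere, AccessedBy mappings, remote atomic faults, an accessible
// preferred location, remote CPU faults, and finally a fallback to the
// faulting processor.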
10047 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
10048                                                  uvm_va_block_context_t *va_block_context,
10049                                                  uvm_page_index_t page_index,
10050                                                  uvm_processor_id_t processor_id,
10051                                                  NvU32 access_type_mask,
10052                                                  const uvm_va_policy_t *policy,
10053                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10054                                                  uvm_service_operation_t operation,
10055                                                  bool *read_duplicate)
10056 {
10057     uvm_processor_id_t closest_resident_processor;
10058     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10059     bool may_read_duplicate;
10060     uvm_processor_id_t preferred_location;
10061 
    // TODO: Bug 3660968: Remove the uvm_hmm_must_use_sysmem() check as soon as
    // HMM migration is implemented for VMAs other than anonymous memory.
10064     if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) {
10065         *read_duplicate = false;
10066         return UVM_ID_CPU;
10067     }
10068 
10069     may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
10070 
10071     // Read/prefetch faults on a VA range with read duplication enabled
10072     // always create a copy of the page on the faulting processor's memory.
10073     // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH,
10074     // which will lead to read duplication if it is enabled.
10075     *read_duplicate = may_read_duplicate &&
10076                       (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ);
10077 
10078     if (*read_duplicate)
10079         return processor_id;
10080 
10081     *read_duplicate = false;
10082 
10083     // If read-duplication is active in the page but we are not
10084     // read-duplicating because the access type is not a read or a prefetch,
10085     // the faulting processor should get a local copy
10086     if (may_read_duplicate)
10087         return processor_id;
10088 
10089     // If the faulting processor is the preferred location always migrate
10090     preferred_location = policy->preferred_location;
10091     if (uvm_id_equal(processor_id, preferred_location)) {
10092         if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) {
10093             UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN);
10094             if (uvm_va_space_processor_has_memory(va_space, processor_id))
10095                 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id));
10096         }
10097 
10098         return processor_id;
10099     }
10100 
10101     // If the faulting processor is the CPU, HMM has to migrate the block to
10102     // system memory.
10103     // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements.
10104     if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block))
10105         return processor_id;
10106 
10107     if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10108         UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)],
10109                                            processor_id));
10110         return thrashing_hint->pin.residency;
10111     }
10112 
10113     closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id);
10114 
10115     // If the page is not resident anywhere, select the preferred location as
10116     // long as the preferred location is accessible from the faulting processor.
10117     // Otherwise select the faulting processor.
10118     if (UVM_ID_IS_INVALID(closest_resident_processor)) {
10119         if (UVM_ID_IS_VALID(preferred_location) &&
10120             uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)],
10121                                     processor_id)) {
10122             return preferred_location;
10123         }
10124 
10125         return processor_id;
10126     }
10127 
    // AccessedBy mappings might not have been created for the CPU if the
    // thread which made the memory resident did not have the proper references
    // on the mm_struct (for example, the GPU fault handling path when
    // uvm_va_space_mm_enabled() is false).
10132     //
10133     // Also, in uvm_migrate_*, we implement a two-pass scheme in which
10134     // AccessedBy mappings may be delayed to the second pass. This can produce
10135     // faults even if the faulting processor is in the accessed_by mask.
10136     //
10137     // Here, we keep it on the current residency and we just add the missing
10138     // mapping.
10139     if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
10140         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10141         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10142         return closest_resident_processor;
10143     }
10144 
10145     // Check if we should map the closest resident processor remotely on atomic
10146     // fault
10147     if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
10148         return closest_resident_processor;
10149 
10150     // If the processor has access to the preferred location, and the page is
10151     // not resident on the accessing processor, move it to the preferred
10152     // location.
10153     if (!uvm_id_equal(closest_resident_processor, processor_id) &&
10154         UVM_ID_IS_VALID(preferred_location) &&
10155         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
10156         return preferred_location;
10157 
    // Check if we should map the closest resident processor remotely on a
    // remote CPU fault.
    //
    // When the CPU faults, there is a Linux process faulting on its behalf,
    // associated with a unique address space pointed to by current->mm. A
    // block of memory residing on the GPU is also associated with an address
    // space, pointed to by va_block_context->mm. If they match, it's a
    // regular (local) fault, and we may want to migrate the page from the GPU
    // to the CPU. If they differ, it's a 'remote' fault, and we might prefer
    // to preserve the current residency.
    //
    // Servicing a remote fault without access counters means the memory could
    // stay in the wrong spot for a long time, which is why we generally
    // prefer to avoid creating remote mappings. However, when a NIC accesses
    // memory residing on the GPU, it's worth keeping it in place for those
    // NIC accesses.
    //
    // The logic used to detect remote faulting also keeps memory in place for
    // ptrace accesses. We would prefer to control those policies separately,
    // but the NIC case takes priority.
10175     if (UVM_ID_IS_CPU(processor_id) &&
10176         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10177         va_block_context->mm != current->mm) {
10178         UVM_ASSERT(va_block_context->mm != NULL);
10179         return closest_resident_processor;
10180     }
10181 
10182     // If the page is resident on a processor other than the preferred location,
10183     // or the faulting processor can't access the preferred location, we select
10184     // the faulting processor as the new residency.
10185     return processor_id;
10186 }
10187 
10188 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
10189                                                  uvm_va_block_context_t *va_block_context,
10190                                                  uvm_page_index_t page_index,
10191                                                  uvm_processor_id_t processor_id,
10192                                                  NvU32 access_type_mask,
10193                                                  const uvm_va_policy_t *policy,
10194                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10195                                                  uvm_service_operation_t operation,
10196                                                  bool *read_duplicate)
10197 {
10198     uvm_processor_id_t id;
10199 
10200     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10201                                                   va_block_context->hmm.vma,
10202                                                   uvm_va_block_region_for_page(page_index)));
10203 
10204     id = block_select_residency(va_block,
10205                                 va_block_context,
10206                                 page_index,
10207                                 processor_id,
10208                                 access_type_mask,
10209                                 policy,
10210                                 thrashing_hint,
10211                                 operation,
10212                                 read_duplicate);
10213 
10214     // If the intended residency doesn't have memory, fall back to the CPU.
10215     if (!block_processor_has_memory(va_block, id)) {
10216         *read_duplicate = false;
10217         return UVM_ID_CPU;
10218     }
10219 
10220     return id;
10221 }
10222 
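// Sanity check for access counter servicing: verify that no page in the
// region is mapped with revoke_prot on any of the revoke_processors, i.e.
// that servicing the notification will not revoke any permissions. Always
// returns true so it can be wrapped in UVM_ASSERT().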
10223 static bool check_access_counters_dont_revoke(uvm_va_block_t *block,
10224                                               uvm_va_block_context_t *block_context,
10225                                               uvm_va_block_region_t region,
10226                                               const uvm_processor_mask_t *revoke_processors,
10227                                               const uvm_page_mask_t *revoke_page_mask,
10228                                               uvm_prot_t revoke_prot)
10229 {
10230     uvm_processor_id_t id;
10231     for_each_id_in_mask(id, revoke_processors) {
10232         const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot);
10233 
10234         uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot);
10235 
10236         UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0);
10237     }
10238 
10239     return true;
10240 }
10241 
10242 // Update service_context->prefetch_hint, service_context->per_processor_masks,
10243 // and service_context->region.
10244 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
10245                                            const uvm_va_policy_t *policy,
10246                                            uvm_service_block_context_t *service_context)
10247 {
10248     uvm_processor_id_t new_residency;
10249     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10250 
    // Performance heuristics policy: we only consider prefetching when there
    // are migrations to a single processor.
10253     if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) {
10254         uvm_page_index_t page_index;
10255         uvm_page_mask_t *new_residency_mask;
10256 
10257         new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors);
10258         new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10259 
10260         // Update prefetch tracking structure with the pages that will migrate
10261         // due to faults
10262         uvm_perf_prefetch_get_hint_va_block(va_block,
10263                                             &service_context->block_context,
10264                                             new_residency,
10265                                             new_residency_mask,
10266                                             service_context->region,
10267                                             &service_context->prefetch_bitmap_tree,
10268                                             &service_context->prefetch_hint);
10269 
10270         // Obtain the prefetch hint and give a fake fault access type to the
10271         // prefetched pages
10272         if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) {
10273             const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask;
10274 
10275             for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) {
10276                 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index));
10277 
10278                 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH;
10279 
10280                 if (uvm_va_policy_is_read_duplicate(policy, va_space) ||
10281                     (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
10282                      uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) {
10283                     if (service_context->read_duplicate_count++ == 0)
10284                         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10285 
10286                     uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10287                 }
10288             }
10289 
10290             uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask);
10291             service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask);
10292         }
10293     }
10294     else {
10295         service_context->prefetch_hint.residency = UVM_ID_INVALID;
10296     }
10297 }
10298 
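// Illustrative servicing sequence (a sketch only; the real fault and access
// counter paths add allocation-retry handling and iterate over all processors
// in service_context->resident_processors):
//
//     status = uvm_va_block_service_copy(processor_id,
//                                        new_residency,
//                                        va_block,
//                                        block_retry,
//                                        service_context);
//     if (status == NV_OK)
//         status = uvm_va_block_service_finish(processor_id, va_block, service_context);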
10299 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
10300                                     uvm_processor_id_t new_residency,
10301                                     uvm_va_block_t *va_block,
10302                                     uvm_va_block_retry_t *block_retry,
10303                                     uvm_service_block_context_t *service_context)
10304 {
10305     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10306     uvm_processor_mask_t *all_involved_processors =
10307         &service_context->block_context.make_resident.all_involved_processors;
10308     uvm_page_mask_t *new_residency_mask =
10309         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10310     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10311     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10312     uvm_make_resident_cause_t cause;
10313     NV_STATUS status;
10314 
10315     // 1- Migrate pages
10316     switch (service_context->operation) {
10317         case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS:
10318             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
10319             break;
10320         case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS:
10321             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
10322             break;
10323         case UVM_SERVICE_OPERATION_ACCESS_COUNTERS:
10324             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10325             break;
10326         default:
10327             UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation);
10328             // Set cause to silence compiler warning that it may be unused.
10329             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10330             break;
10331     }
10332 
10333     // Reset masks before all of the make_resident calls
10334     uvm_page_mask_zero(did_migrate_mask);
10335     uvm_processor_mask_zero(all_involved_processors);
10336 
10337     // Handle read duplication first so that the caller_page_mask will be free
10338     // to use below and still valid in uvm_va_block_service_finish().
10339     // TODO: Bug 3660922: need to implement HMM read duplication support.
10340     if (service_context->read_duplicate_count != 0 &&
10341         uvm_page_mask_and(caller_page_mask,
10342                           new_residency_mask,
10343                           &service_context->read_duplicate_mask)) {
10344         status = uvm_va_block_make_resident_read_duplicate(va_block,
10345                                                            block_retry,
10346                                                            &service_context->block_context,
10347                                                            new_residency,
10348                                                            service_context->region,
10349                                                            caller_page_mask,
10350                                                            &service_context->prefetch_hint.prefetch_pages_mask,
10351                                                            cause);
10352         if (status != NV_OK)
10353             return status;
10354     }
10355 
10356     if (service_context->read_duplicate_count == 0 ||
10357         uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) {
10358         if (service_context->read_duplicate_count == 0)
10359             uvm_page_mask_copy(caller_page_mask, new_residency_mask);
10360         status = uvm_va_block_make_resident_copy(va_block,
10361                                                  block_retry,
10362                                                  &service_context->block_context,
10363                                                  new_residency,
10364                                                  service_context->region,
10365                                                  caller_page_mask,
10366                                                  &service_context->prefetch_hint.prefetch_pages_mask,
10367                                                  cause);
10368         if (status != NV_OK)
10369             return status;
10370     }
10371 
10372     if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors))
10373         service_context->cpu_fault.did_migrate = true;
10374 
10375     // 2- Check for ECC errors on all GPUs involved in the migration if CPU is
10376     //    the destination. Migrations in response to CPU faults are special
10377     //    because they're on the only path (apart from tools) where CUDA is not
10378     //    involved and wouldn't have a chance to do its own ECC checking.
10379     if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS &&
10380         UVM_ID_IS_CPU(new_residency) &&
10381         !uvm_processor_mask_empty(all_involved_processors)) {
10382         uvm_gpu_t *gpu;
10383 
10384         // Before checking for ECC errors, make sure all of the GPU work
10385         // is finished. Creating mappings on the CPU would have to wait
10386         // for the tracker anyway so this shouldn't hurt performance.
10387         status = uvm_tracker_wait(&va_block->tracker);
10388         if (status != NV_OK)
10389             return status;
10390 
10391         for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) {
10392             // We cannot call into RM here so use the no RM ECC check.
10393             status = uvm_gpu_check_ecc_error_no_rm(gpu);
10394             if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
10395                 // In case we need to call into RM to be sure whether
10396                 // there is an ECC error or not, signal that to the
10397                 // caller by adding the GPU to the mask.
10398                 //
10399                 // In that case the ECC error might be noticed only after
10400                 // the CPU mappings have been already created below,
10401                 // exposing different CPU threads to the possibly corrupt
10402                 // data, but this thread will fault eventually and that's
10403                 // considered to be an acceptable trade-off between
10404                 // performance and ECC error containment.
10405                 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id);
10406                 status = NV_OK;
10407             }
10408             if (status != NV_OK)
10409                 return status;
10410         }
10411     }
10412 
10413     return NV_OK;
10414 }
10415 
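// Second half of block servicing: finish the residency update started by
// uvm_va_block_service_copy(), compute the per-page mapping protections for
// the requesting processor on the new residency, and revoke conflicting
// write/atomic permissions from other processors for pages whose residency
// did not change before establishing the new mappings.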
10416 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
10417                                       uvm_va_block_t *va_block,
10418                                       uvm_service_block_context_t *service_context)
10419 {
10420     uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id;
10421     uvm_page_mask_t *new_residency_mask =
10422         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10423     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10424     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10425     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10426     uvm_prot_t new_prot;
10427     uvm_page_index_t page_index;
10428     NV_STATUS status;
10429 
10430     // Update residency.
10431     if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
10432         uvm_va_block_make_resident_finish(va_block,
10433                                           &service_context->block_context,
10434                                           service_context->region,
10435                                           caller_page_mask);
10436 
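    // Compute the requested pages that did not actually change residency
    // (they were already resident at the destination). Per the note in step 2
    // below, only these pages can still have stale mappings to revoke.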
10437     uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask);
10438 
10439     // The loops below depend on the enums having the following values in order
10440     // to index into service_context->mappings_by_prot[].
10441     BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1);
10442     BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2);
10443     BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3);
10444     BUILD_BUG_ON(UVM_PROT_MAX != 4);
10445 
10446     // 1- Compute mapping protections for the requesting processor on the new
10447     // residency.
10448     for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot)
10449         service_context->mappings_by_prot[new_prot - 1].count = 0;
10450 
10451     for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
10452         new_prot = compute_new_permission(va_block,
10453                                           service_context->block_context.hmm.vma,
10454                                           page_index,
10455                                           processor_id,
10456                                           new_residency,
10457                                           service_context->access_type[page_index]);
10458 
10459         if (service_context->mappings_by_prot[new_prot - 1].count++ == 0)
10460             uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask);
10461 
10462         uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index);
10463     }
10464 
10465     // 2- Revoke permissions
10466     //
10467     // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations.
10468     //       Thus, we need to revoke only if residency did not change and we
10469     //       are mapping higher than READ ONLY.
10470     for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10471         bool pages_need_revocation;
10472         uvm_processor_mask_t revoke_processors;
10473         uvm_prot_t revoke_prot;
10474         bool this_processor_has_enabled_atomics;
10475 
10476         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10477             continue;
10478 
10479         pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask,
10480                                                   &service_context->did_not_migrate_mask,
10481                                                   &service_context->mappings_by_prot[new_prot - 1].page_mask);
10482         if (!pages_need_revocation)
10483             continue;
10484 
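        // Candidate processors for revocation: those that currently have this
        // block mapped and are able to take faults.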
10485         uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors);
10486 
10487         // Do not revoke the processor that took the fault
10488         uvm_processor_mask_clear(&revoke_processors, processor_id);
10489 
10490         this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors,
10491                                                                      processor_id);
10492 
        // Atomic operations on processors with system-wide atomics disabled,
        // or with native atomic access to new_residency, behave like writes.
10496         if (new_prot == UVM_PROT_READ_WRITE ||
10497             !this_processor_has_enabled_atomics ||
10498             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) {
10499 
10500             // Exclude processors with native atomics on the resident copy
10501             uvm_processor_mask_andnot(&revoke_processors,
10502                                       &revoke_processors,
10503                                       &va_space->has_native_atomics[uvm_id_value(new_residency)]);
10504 
10505             // Exclude processors with disabled system-wide atomics
10506             uvm_processor_mask_and(&revoke_processors,
10507                                    &revoke_processors,
10508                                    &va_space->system_wide_atomics_enabled_processors);
10509         }
10510 
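        // Choose what to revoke from the other processors: if the CPU faulted,
        // or a GPU only needs RW, stripping atomics (revoking RWA) is enough.
        // If a GPU needs RWA, the others must also lose write access, so
        // revoke from RW upwards.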
10511         if (UVM_ID_IS_CPU(processor_id)) {
10512             revoke_prot = UVM_PROT_READ_WRITE_ATOMIC;
10513         }
10514         else {
10515             revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE:
10516                                                                     UVM_PROT_READ_WRITE_ATOMIC;
10517         }
10518 
10519         // UVM-Lite processors must always have RWA mappings
10520         if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) {
            // Access counters should never trigger revocations apart from
            // those needed for read-duplication, which are performed in the
            // calls to uvm_va_block_make_resident_read_duplicate, above.
10524             if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10525                 UVM_ASSERT(check_access_counters_dont_revoke(va_block,
10526                                                              &service_context->block_context,
10527                                                              service_context->region,
10528                                                              &revoke_processors,
10529                                                              &service_context->revocation_mask,
10530                                                              revoke_prot));
10531             }
10532 
10533             // Downgrade other processors' mappings
10534             status = uvm_va_block_revoke_prot_mask(va_block,
10535                                                    &service_context->block_context,
10536                                                    &revoke_processors,
10537                                                    service_context->region,
10538                                                    &service_context->revocation_mask,
10539                                                    revoke_prot);
10540             if (status != NV_OK)
10541                 return status;
10542         }
10543     }
10544 
10545     // 3- Map requesting processor with the necessary privileges
10546     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10547         const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask;
10548 
10549         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10550             continue;
10551 
10552         // 3.1 - Unmap CPU pages
        // HMM CPU mappings can be upgraded at any time without notification,
        // so there is no need to downgrade first.
10555         if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
10556             UVM_ID_IS_CPU(processor_id) &&
10557             !uvm_va_block_is_hmm(va_block)) {
            // The kernel can downgrade managed CPU mappings at any time
            // without notifying us, which means our PTE state could be stale.
            // We handle this by unmapping the CPU PTE and then re-mapping it.
10561             //
10562             // A CPU fault is unexpected if:
10563             // curr_prot == RW || (!is_write && curr_prot == RO)
10564             status = uvm_va_block_unmap(va_block,
10565                                         &service_context->block_context,
10566                                         UVM_ID_CPU,
10567                                         service_context->region,
10568                                         map_prot_mask,
10569                                         NULL);
10570             if (status != NV_OK)
10571                 return status;
10572         }
10573 
10574         // 3.2 - Add new mappings
10575 
10576         // The faulting processor can be mapped remotely due to user policy or
10577         // the thrashing mitigation heuristics. Therefore, we set the cause
10578         // accordingly in each case.
10579 
10580         // Map pages that are thrashing first
10581         if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
10582             uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask;
10583             bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
10584                                                         map_prot_mask,
10585                                                         &service_context->thrashing_pin_mask);
10586             if (pages_need_mapping) {
10587                 status = uvm_va_block_map(va_block,
10588                                           &service_context->block_context,
10589                                           processor_id,
10590                                           service_context->region,
10591                                           helper_page_mask,
10592                                           new_prot,
10593                                           UvmEventMapRemoteCauseThrashing,
10594                                           &va_block->tracker);
10595                 if (status != NV_OK)
10596                     return status;
10597 
10598                 // Remove thrashing pages from the map mask
10599                 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask,
10600                                                           map_prot_mask,
10601                                                           &service_context->thrashing_pin_mask);
10602                 if (!pages_need_mapping)
10603                     continue;
10604 
10605                 map_prot_mask = helper_page_mask;
10606             }
10607         }
10608 
10609         status = uvm_va_block_map(va_block,
10610                                   &service_context->block_context,
10611                                   processor_id,
10612                                   service_context->region,
10613                                   map_prot_mask,
10614                                   new_prot,
10615                                   UvmEventMapRemoteCausePolicy,
10616                                   &va_block->tracker);
10617         if (status != NV_OK)
10618             return status;
10619     }
10620 
10621     // 4- If pages did migrate, map SetAccessedBy processors, except for
10622     // UVM-Lite
10623     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10624         bool pages_need_mapping;
10625 
10626         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10627             continue;
10628 
10629         pages_need_mapping = uvm_page_mask_and(caller_page_mask,
10630                                                new_residency_mask,
10631                                                &service_context->mappings_by_prot[new_prot - 1].page_mask);
10632         if (!pages_need_mapping)
10633             continue;
10634 
10635         // Map pages that are thrashing
10636         if (service_context->thrashing_pin_count > 0) {
10637             uvm_page_index_t page_index;
10638 
10639             for_each_va_block_page_in_region_mask(page_index,
10640                                                   &service_context->thrashing_pin_mask,
10641                                                   service_context->region) {
10642                 uvm_processor_mask_t *map_thrashing_processors = NULL;
10643                 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index);
10644 
10645                 // Check protection type
10646                 if (!uvm_page_mask_test(caller_page_mask, page_index))
10647                     continue;
10648 
10649                 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
10650 
10651                 status = uvm_va_block_add_mappings_after_migration(va_block,
10652                                                                    &service_context->block_context,
10653                                                                    new_residency,
10654                                                                    processor_id,
10655                                                                    uvm_va_block_region_for_page(page_index),
10656                                                                    caller_page_mask,
10657                                                                    new_prot,
10658                                                                    map_thrashing_processors);
10659                 if (status != NV_OK)
10660                     return status;
10661             }
10662 
10663             pages_need_mapping = uvm_page_mask_andnot(caller_page_mask,
10664                                                       caller_page_mask,
10665                                                       &service_context->thrashing_pin_mask);
10666             if (!pages_need_mapping)
10667                 continue;
10668         }
10669 
        // Map the rest of the pages in a single shot
10671         status = uvm_va_block_add_mappings_after_migration(va_block,
10672                                                            &service_context->block_context,
10673                                                            new_residency,
10674                                                            processor_id,
10675                                                            service_context->region,
10676                                                            caller_page_mask,
10677                                                            new_prot,
10678                                                            NULL);
10679         if (status != NV_OK)
10680             return status;
10681     }
10682 
10683     return NV_OK;
10684 }
10685 
10686 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
10687                                       uvm_va_block_t *va_block,
10688                                       uvm_va_block_retry_t *block_retry,
10689                                       uvm_service_block_context_t *service_context)
10690 {
10691     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10692     uvm_processor_id_t new_residency;
10693     NV_STATUS status = NV_OK;
10694 
10695     uvm_assert_mutex_locked(&va_block->lock);
10696     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10697                                                   service_context->block_context.hmm.vma,
10698                                                   service_context->region));
10699 
10700     // GPU fault servicing must be done under the VA space read lock. GPU fault
10701     // servicing is required for RM to make forward progress, and we allow other
10702     // threads to call into RM while holding the VA space lock in read mode. If
10703     // we took the VA space lock in write mode on the GPU fault service path,
10704     // we could deadlock because the thread in RM which holds the VA space lock
10705     // for read wouldn't be able to complete until fault servicing completes.
10706     if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id))
10707         uvm_assert_rwsem_locked(&va_space->lock);
10708     else
10709         uvm_assert_rwsem_locked_read(&va_space->lock);
10710 
10711     uvm_va_block_get_prefetch_hint(va_block,
10712                                    uvm_va_policy_get_region(va_block, service_context->region),
10713                                    service_context);
10714 
10715     for_each_id_in_mask(new_residency, &service_context->resident_processors) {
10716         if (uvm_va_block_is_hmm(va_block)) {
            status = uvm_hmm_va_block_service_locked(processor_id,
                                                     new_residency,
                                                     va_block,
                                                     block_retry,
                                                     service_context);
10718             if (status != NV_OK)
10719                 break;
10720 
10721             continue;
10722         }
10723 
10724         status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context);
10725         if (status != NV_OK)
10726             break;
10727 
10728         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
10729         if (status != NV_OK)
10730             break;
10731     }
10732 
10733     return status;
10734 }
10735 
10736 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
10737                                                  uvm_va_block_context_t *va_block_context,
10738                                                  uvm_processor_id_t processor_id,
10739                                                  uvm_page_index_t page_index,
10740                                                  uvm_fault_type_t access_type,
10741                                                  bool allow_migration)
10742 {
10743     uvm_va_range_t *va_range = va_block->va_range;
10744     uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type);
10745 
10746     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10747                                                   va_block_context->hmm.vma,
10748                                                   uvm_va_block_region_for_page(page_index)));
10749 
10750     // CPU permissions are checked later by block_map_cpu_page.
10751     //
10752     // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because
10753     //       it can also be called from change_pte. Make change_pte call this
10754     //       function and only check CPU permissions here.
10755     if (UVM_ID_IS_GPU(processor_id)) {
10756         if (va_range && uvm_va_range_is_managed_zombie(va_range))
10757             return NV_ERR_INVALID_ADDRESS;
10758 
10759         // GPU faults only check vma permissions if a mm is registered with the
        // VA space (i.e., uvm_va_space_mm_retain_lock(va_space) != NULL) or if
10761         // uvm_enable_builtin_tests is set, because the Linux kernel can change
10762         // vm_flags at any moment (for example on mprotect) and here we are not
10763         // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that
10764         // this scenario does not happen.
10765         if (((va_block->hmm.va_space && va_block->hmm.va_space->va_space_mm.mm) || uvm_enable_builtin_tests) &&
10766             (access_prot > compute_logical_prot(va_block, va_block_context->hmm.vma, page_index)))
10767             return NV_ERR_INVALID_ACCESS_TYPE;
10768     }
10769 
10770     // Non-migratable range:
10771     // - CPU accesses are always fatal, regardless of the VA range residency
10772     // - GPU accesses are fatal if the GPU can't map the preferred location
10773     if (!allow_migration) {
10774         UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
10775 
10776         if (UVM_ID_IS_CPU(processor_id)) {
10777             return NV_ERR_INVALID_OPERATION;
10778         }
10779         else {
10780             uvm_va_space_t *va_space = va_range->va_space;
10781 
10782             return uvm_processor_mask_test(
10783                     &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
10784                     processor_id)?
10785                 NV_OK : NV_ERR_INVALID_ACCESS_TYPE;
10786         }
10787     }
10788 
10789     return NV_OK;
10790 }
10791 
// Check if we are faulting on a page that already has valid permissions, in
// which case fault handling can be skipped. See
// uvm_va_block_t::cpu::fault_authorized for more details.
10795 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
10796                                                   uvm_page_index_t page_index,
10797                                                   uvm_fault_access_type_t fault_access_type)
10798 {
10799     // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for
10800     // HMM?
10801     if (uvm_va_block_is_hmm(va_block))
10802         return false;
10803 
10804     if (block_page_is_processor_authorized(va_block,
10805                                            page_index,
10806                                            UVM_ID_CPU,
10807                                            uvm_fault_access_type_to_prot(fault_access_type))) {
10808         NvU64 now = NV_GETTIME();
10809         pid_t pid = current->pid;
10810 
10811         // Latch the pid/timestamp/page_index values for the first time
10812         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
10813             va_block->cpu.fault_authorized.first_fault_stamp = now;
10814             va_block->cpu.fault_authorized.first_pid = pid;
10815             va_block->cpu.fault_authorized.page_index = page_index;
10816 
10817             return true;
10818         }
10819 
10820         // If the same thread shows up again, this means that the kernel
10821         // downgraded the page's PTEs. Service the fault to force a remap of
10822         // the page.
10823         if (va_block->cpu.fault_authorized.first_pid == pid &&
10824             va_block->cpu.fault_authorized.page_index == page_index) {
10825             va_block->cpu.fault_authorized.first_fault_stamp = 0;
10826         }
10827         else {
            // If the window has expired, clear the information and service the
            // fault. Otherwise, just return.
10830             if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns)
10831                 va_block->cpu.fault_authorized.first_fault_stamp = 0;
10832             else
10833                 return true;
10834         }
10835     }
10836 
10837     return false;
10838 }
10839 
10840 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
10841                                         uvm_va_block_retry_t *va_block_retry,
10842                                         NvU64 fault_addr,
10843                                         uvm_fault_access_type_t fault_access_type,
10844                                         uvm_service_block_context_t *service_context)
10845 {
10846     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10847     NV_STATUS status = NV_OK;
10848     uvm_page_index_t page_index;
10849     uvm_perf_thrashing_hint_t thrashing_hint;
10850     uvm_processor_id_t new_residency;
10851     bool read_duplicate;
10852     const uvm_va_policy_t *policy;
10853 
10854     uvm_assert_rwsem_locked(&va_space->lock);
10855 
10856     UVM_ASSERT(fault_addr >= va_block->start);
10857     UVM_ASSERT(fault_addr <= va_block->end);
10858 
10859     uvm_assert_mmap_lock_locked(service_context->block_context.mm);
10860 
10861     policy = uvm_va_policy_get(va_block, fault_addr);
10862 
10863     if (service_context->num_retries == 0) {
10864         // notify event to tools/performance heuristics
10865         uvm_perf_event_notify_cpu_fault(&va_space->perf_events,
10866                                         va_block,
10867                                         policy->preferred_location,
10868                                         fault_addr,
10869                                         fault_access_type > UVM_FAULT_ACCESS_TYPE_READ,
10870                                         KSTK_EIP(current));
10871     }
10872 
10873     // Check logical permissions
10874     page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
10875     status = uvm_va_block_check_logical_permissions(va_block,
10876                                                     &service_context->block_context,
10877                                                     UVM_ID_CPU,
10878                                                     page_index,
10879                                                     fault_access_type,
10880                                                     uvm_range_group_address_migratable(va_space, fault_addr));
10881     if (status != NV_OK)
10882         return status;
10883 
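    // Clear the ECC-check mask up front so that it is empty even if servicing
    // is skipped below.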
10884     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
10885 
10886     if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
10887         return NV_OK;
10888 
10889     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU);
10890     // Throttling is implemented by sleeping in the fault handler on the CPU
10891     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
10892         service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp;
10893         return NV_WARN_MORE_PROCESSING_REQUIRED;
10894     }
10895 
10896     service_context->read_duplicate_count = 0;
10897     service_context->thrashing_pin_count = 0;
10898     service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
10899 
10900     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10901         uvm_page_mask_zero(&service_context->thrashing_pin_mask);
10902         uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
10903         service_context->thrashing_pin_count = 1;
10904     }
10905 
10906     // Compute new residency and update the masks
10907     new_residency = uvm_va_block_select_residency(va_block,
10908                                                   &service_context->block_context,
10909                                                   page_index,
10910                                                   UVM_ID_CPU,
10911                                                   uvm_fault_access_type_mask_bit(fault_access_type),
10912                                                   policy,
10913                                                   &thrashing_hint,
10914                                                   UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
10915                                                   &read_duplicate);
10916 
10917     // Initialize the minimum necessary state in the fault service context
10918     uvm_processor_mask_zero(&service_context->resident_processors);
10919 
10920     // Set new residency and update the masks
10921     uvm_processor_mask_set(&service_context->resident_processors, new_residency);
10922 
10923     // The masks need to be fully zeroed as the fault region may grow due to prefetching
10924     uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
10925     uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
10926 
10927     if (read_duplicate) {
10928         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10929         uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10930         service_context->read_duplicate_count = 1;
10931     }
10932 
10933     service_context->access_type[page_index] = fault_access_type;
10934 
10935     service_context->region = uvm_va_block_region_for_page(page_index);
10936 
10937     status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context);
10938 
10939     ++service_context->num_retries;
10940 
10941     return status;
10942 }
10943 
10944 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
10945                                  NvU64 fault_addr,
10946                                  bool is_write,
10947                                  uvm_service_block_context_t *service_context)
10948 {
10949     NV_STATUS status;
10950     uvm_va_block_retry_t va_block_retry;
10951     uvm_fault_access_type_t fault_access_type;
10952 
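    // A write fault is serviced as a strong-atomic access, i.e. it requests
    // the highest (RWA) permission for the CPU. This is presumably because CPU
    // PTEs cannot distinguish plain writes from atomics, so the conservative
    // access type is used.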
10953     if (is_write)
10954         fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
10955     else
10956         fault_access_type = UVM_FAULT_ACCESS_TYPE_READ;
10957 
10958     service_context->num_retries = 0;
10959     service_context->cpu_fault.did_migrate = false;
10960 
10961     // We have to use vm_insert_page instead of handing the page to the kernel
10962     // and letting it insert the mapping, and we must do that while holding the
10963     // lock on this VA block. Otherwise there will be a window in which we think
10964     // we've mapped the page but the CPU mapping hasn't actually been created
10965     // yet. During that window a GPU fault event could arrive and claim
10966     // ownership of that VA, "unmapping" it. Then later the kernel would
10967     // eventually establish the mapping, and we'd end up with both CPU and GPU
10968     // thinking they each owned the page.
10969     //
10970     // This function must only be called when it's safe to call vm_insert_page.
10971     // That is, there must be a reference held on the vma's vm_mm, and
10972     // vm_mm->mmap_lock is held in at least read mode. Note that current->mm
10973     // might not be vma->vm_mm.
10974     status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
10975                                      &va_block_retry,
10976                                      block_cpu_fault_locked(va_block,
10977                                                             &va_block_retry,
10978                                                             fault_addr,
10979                                                             fault_access_type,
10980                                                             service_context));
10981     return status;
10982 }
10983 
10984 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block)
10985 {
10986     uvm_va_range_t *va_range;
10987     uvm_va_block_t *block;
10988     size_t index;
10989 
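    // Managed VA ranges take precedence; if no VA range covers the address,
    // fall back to looking up an HMM block.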
10990     va_range = uvm_va_range_find(va_space, addr);
10991     if (!va_range)
10992         return uvm_hmm_va_block_find(va_space, addr, out_block);
10993 
10994     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
10995                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
10996 
10997     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
10998         return NV_ERR_INVALID_ADDRESS;
10999 
11000     index = uvm_va_range_block_index(va_range, addr);
11001     block = uvm_va_range_block(va_range, index);
11002     if (!block)
11003         return NV_ERR_OBJECT_NOT_FOUND;
11004 
11005     *out_block = block;
11006     return NV_OK;
11007 }
11008 
11009 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
11010                                             uvm_va_range_t *va_range,
11011                                             NvU64 addr,
11012                                             uvm_va_block_t **out_block)
11013 {
11014     size_t index;
11015 
11016     if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0)
11017         return NV_ERR_NO_MEMORY;
11018 
11019     UVM_ASSERT(va_range);
11020     UVM_ASSERT(addr >= va_range->node.start);
11021     UVM_ASSERT(addr <= va_range->node.end);
11022 
11023     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
11024                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
11025 
11026     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
11027         return NV_ERR_INVALID_ADDRESS;
11028 
11029     index = uvm_va_range_block_index(va_range, addr);
11030     return uvm_va_range_block_create(va_range, index, out_block);
11031 }
11032 
NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
                                           NvU64 addr,
                                           uvm_va_block_t **out_block)
11036 {
11037     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
11038 
11039     if (va_range)
11040         return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
11041     else
11042         return NV_ERR_INVALID_ADDRESS;
11043 }
11044 
11045 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
11046                                    NvU64 addr,
11047                                    struct vm_area_struct **hmm_vma,
11048                                    uvm_va_block_t **out_block)
11049 {
11050     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
11051 
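    // Clear hmm_vma up front; it is only expected to be filled in on the HMM
    // path below.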
11052     if (hmm_vma)
11053         *hmm_vma = NULL;
11054 
11055     if (va_range)
11056         return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
11057     else
11058         return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
11059 }
11060 
11061 // Launch a synchronous, encrypted copy between GPU and CPU.
11062 //
11063 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a
// CPU-side decryption step, such that the destination CPU buffer pointed to
// by dst_plain will contain the unencrypted (plain text) contents. The
// destination buffer can be in protected or unprotected sysmem, while the
// source buffer must be in protected vidmem.
11068 //
11069 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11070 //
11071 // The input tracker, if not NULL, is internally acquired by the push
11072 // responsible for the encrypted copy.
11073 __attribute__ ((format(printf, 6, 7)))
11074 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
11075                                               void *dst_plain,
11076                                               uvm_gpu_address_t src_gpu_address,
11077                                               size_t size,
11078                                               uvm_tracker_t *tracker,
11079                                               const char *format,
11080                                               ...)
11081 {
11082     NV_STATUS status;
11083     UvmCslIv decrypt_iv;
11084     uvm_push_t push;
11085     uvm_conf_computing_dma_buffer_t *dma_buffer;
11086     uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
11087     void *src_cipher, *auth_tag;
11088     va_list args;
11089 
11090     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11091     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11092 
11093     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11094     if (status != NV_OK)
11095         return status;
11096 
11097     va_start(args, format);
11098     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
11099     va_end(args);
11100 
11101     if (status != NV_OK)
11102         goto out;
11103 
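    // Log the upcoming GPU-side encryption and obtain the IV that the CPU will
    // later use to decrypt the staged ciphertext.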
11104     uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
11105 
11106     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11107     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11108     gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11109 
11110     status = uvm_push_end_and_wait(&push);
11111     if (status != NV_OK)
11112         goto out;
11113 
11114     src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11115     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11116     status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
11117 
11118  out:
11119     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11120     return status;
11121 }
11122 
11123 // Launch a synchronous, encrypted copy between CPU and GPU.
11124 //
// The source CPU buffer pointed to by src_plain contains the unencrypted
// (plain text) contents; the function internally performs a CPU-side
// encryption step before launching the GPU-side CE decryption. The source
// buffer can be in protected or unprotected sysmem, while the destination
// buffer must be in protected vidmem.
11130 //
11131 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11132 //
11133 // The input tracker, if not NULL, is internally acquired by the push
11134 // responsible for the encrypted copy.
11135 __attribute__ ((format(printf, 6, 7)))
11136 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
11137                                               uvm_gpu_address_t dst_gpu_address,
11138                                               void *src_plain,
11139                                               size_t size,
11140                                               uvm_tracker_t *tracker,
11141                                               const char *format,
11142                                               ...)
11143 {
11144     NV_STATUS status;
11145     uvm_push_t push;
11146     uvm_conf_computing_dma_buffer_t *dma_buffer;
11147     uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
11148     void *dst_cipher, *auth_tag;
11149     va_list args;
11150 
11151     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11152     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11153 
11154     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11155     if (status != NV_OK)
11156         return status;
11157 
11158     va_start(args, format);
11159     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
11160     va_end(args);
11161 
11162     if (status != NV_OK)
11163         goto out;
11164 
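    // Encrypt the plaintext into the unprotected staging buffer on the CPU;
    // the CE then decrypts it into the protected vidmem destination below.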
11165     dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11166     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11167     uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
11168 
11169     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11170     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11171     gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11172 
11173     status = uvm_push_end_and_wait(&push);
11174 
11175 out:
11176     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11177     return status;
11178 }
11179 
11180 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
11181                                            uvm_gpu_t *gpu,
11182                                            uvm_gpu_address_t dst_gpu_address,
11183                                            NvU64 dst,
11184                                            uvm_mem_t *src_mem,
11185                                            size_t size)
11186 {
11187     NV_STATUS status;
11188     uvm_push_t push;
11189     uvm_gpu_address_t src_gpu_address;
11190 
11191     if (uvm_conf_computing_mode_enabled(gpu)) {
11192         return encrypted_memcopy_cpu_to_gpu(gpu,
11193                                             dst_gpu_address,
11194                                             uvm_mem_get_cpu_addr_kernel(src_mem),
11195                                             size,
11196                                             &va_block->tracker,
11197                                             "Encrypted write to [0x%llx, 0x%llx)",
11198                                             dst,
11199                                             dst + size);
11200     }
11201 
11202     status = uvm_push_begin_acquire(gpu->channel_manager,
11203                                     UVM_CHANNEL_TYPE_CPU_TO_GPU,
11204                                     &va_block->tracker,
11205                                     &push,
11206                                     "Direct write to [0x%llx, 0x%llx)",
11207                                     dst,
11208                                     dst + size);
11209     if (status != NV_OK)
11210         return status;
11211 
11212     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
11213     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11214     return uvm_push_end_and_wait(&push);
11215 }
11216 
11217 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
11218                                       uvm_va_block_context_t *block_context,
11219                                       NvU64 dst,
11220                                       uvm_mem_t *src_mem,
11221                                       size_t size)
11222 {
11223     NV_STATUS status;
11224     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst);
11225     NvU64 page_offset = dst & (PAGE_SIZE - 1);
11226     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11227     uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index);
11228 
11229     uvm_assert_mutex_locked(&va_block->lock);
11230     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size);
11231 
11232     if (UVM_ID_IS_INVALID(proc))
11233         proc = UVM_ID_CPU;
11234 
    // Use make_resident() in all cases to break read-duplication. block_retry
    // can be NULL because, if the page is not resident yet, we will make it
    // resident on the CPU.
    // Notably we don't care about coherence with respect to atomics from other
    // processors.
11240     status = uvm_va_block_make_resident(va_block,
11241                                         NULL,
11242                                         block_context,
11243                                         proc,
11244                                         region,
11245                                         NULL,
11246                                         NULL,
11247                                         UVM_MAKE_RESIDENT_CAUSE_API_TOOLS);
11248 
11249     if (status != NV_OK)
11250         return status;
11251 
11252     if (UVM_ID_IS_CPU(proc)) {
11253         char *mapped_page;
11254         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11255         void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
11256 
11257         status = uvm_tracker_wait(&va_block->tracker);
11258         if (status != NV_OK)
11259             return status;
11260 
11261         mapped_page = (char *)kmap(page);
11262         memcpy(mapped_page + page_offset, src, size);
11263         kunmap(page);
11264 
11265         return NV_OK;
11266     }
11267     else {
11268         uvm_gpu_t *dst_gpu;
11269         uvm_gpu_address_t dst_gpu_address;
11270 
11271         UVM_ASSERT(UVM_ID_IS_GPU(proc));
11272 
11273         dst_gpu = block_get_gpu(va_block, proc);
11274 
11275         dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu);
11276         dst_gpu_address.address += page_offset;
11277 
11278         return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
11279     }
11280 }
11281 
11282 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
11283                                           uvm_mem_t *dst_mem,
11284                                           uvm_gpu_t *gpu,
11285                                           uvm_gpu_address_t src_gpu_address,
11286                                           NvU64 src,
11287                                           size_t size)
11288 {
11289     NV_STATUS status;
11290     uvm_push_t push;
11291     uvm_gpu_address_t dst_gpu_address;
11292 
11293     if (uvm_conf_computing_mode_enabled(gpu)) {
11294         return encrypted_memcopy_gpu_to_cpu(gpu,
11295                                             uvm_mem_get_cpu_addr_kernel(dst_mem),
11296                                             src_gpu_address,
11297                                             size,
11298                                             &va_block->tracker,
11299                                             "Encrypted read from [0x%llx, 0x%llx)",
11300                                             src,
11301                                             src + size);
11302     }
11303 
11304     status = uvm_push_begin_acquire(gpu->channel_manager,
11305                                     UVM_CHANNEL_TYPE_GPU_TO_CPU,
11306                                     &va_block->tracker,
11307                                     &push,
11308                                     "Direct read from [0x%llx, 0x%llx)",
11309                                     src,
11310                                     src + size);
11311     if (status != NV_OK)
11312         return status;
11313 
11314     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
11315     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11316     return uvm_push_end_and_wait(&push);
11317 }
11318 
11319 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size)
11320 {
11321     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src);
11322     NvU64 page_offset = src & (PAGE_SIZE - 1);
11323     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11324     void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem);
11325 
11326     uvm_assert_mutex_locked(&va_block->lock);
11327     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size);
11328 
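    // The page is not resident anywhere, so there is no backing data to read
    // yet; return zeros.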
11329     if (UVM_ID_IS_INVALID(proc)) {
11330         memset(dst, 0, size);
11331         return NV_OK;
11332     }
11333     else if (UVM_ID_IS_CPU(proc)) {
11334         NV_STATUS status;
11335         char *mapped_page;
11336         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11337 
11338         status = uvm_tracker_wait(&va_block->tracker);
11339         if (status != NV_OK)
11340             return status;
11341 
11342         mapped_page = (char *)kmap(page);
11343         memcpy(dst, mapped_page + page_offset, size);
11344         kunmap(page);
11345 
11346         return NV_OK;
11347     }
11348     else {
11349         uvm_gpu_address_t src_gpu_address;
11350         uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
11351 
11352         src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu);
11353         src_gpu_address.address += page_offset;
11354 
11355         return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
11356     }
11357 }
11358 
// Deferred work item that reestablishes accessed-by mappings after eviction.
// If access counters are enabled on the evicted GPU, it will also get remote
// mappings.
11362 static void block_add_eviction_mappings(void *args)
11363 {
11364     uvm_va_block_t *va_block = (uvm_va_block_t*)args;
11365     uvm_va_space_t *va_space;
11366     uvm_processor_id_t id;
11367     uvm_va_block_context_t *block_context = NULL;
11368     struct mm_struct *mm = NULL;
11369 
11370     uvm_mutex_lock(&va_block->lock);
11371     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11372     uvm_mutex_unlock(&va_block->lock);
11373 
11374     if (!va_space) {
11375         // Block has been killed in the meantime
11376         goto done;
11377     }
11378 
11379     mm = uvm_va_space_mm_retain_lock(va_space);
11380 
11381     block_context = uvm_va_block_context_alloc(mm);
11382     if (!block_context)
11383         goto done;
11384 
11385     // The block wasn't dead when we checked above and that's enough to
11386     // guarantee that the VA space is still around, because
11387     // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that
11388     // flush waits for this function call to finish.
11389     uvm_va_space_down_read(va_space);
11390 
11391     // Now that we have the VA space lock held, we can check whether the block
11392     // is still alive since the VA space write lock is needed to kill blocks.
11393     if (uvm_va_block_is_dead(va_block))
11394         goto unlock;
11395 
11396     if (uvm_va_block_is_hmm(va_block)) {
11397         uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context);
11398     }
11399     else {
11400         uvm_va_range_t *va_range = va_block->va_range;
11401         NV_STATUS status = NV_OK;
11402 
11403         for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) {
11404             status = uvm_va_block_set_accessed_by(va_block, block_context, id);
11405             if (status != NV_OK)
11406                 break;
11407         }
11408 
11409         if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) {
11410             uvm_processor_mask_t map_processors;
11411 
            // Exclude the processors that have already been mapped due to
            // AccessedBy
11414             uvm_processor_mask_andnot(&map_processors,
11415                                       &va_block->evicted_gpus,
11416                                       &uvm_va_range_get_policy(va_range)->accessed_by);
11417 
11418             for_each_gpu_id_in_mask(id, &map_processors) {
11419                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
11420                 uvm_va_block_gpu_state_t *gpu_state;
11421 
11422                 if (!gpu->parent->access_counters_supported)
11423                     continue;
11424 
11425                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
11426                 UVM_ASSERT(gpu_state);
11427 
11428                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
11429                 // remote mappings to read-duplicated pages. Add support for it
11430                 // or create a new function.
11431                 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
11432                                                  uvm_va_block_add_mappings(va_block,
11433                                                                            block_context,
11434                                                                            id,
11435                                                                            uvm_va_block_region_from_block(va_block),
11436                                                                            &gpu_state->evicted,
11437                                                                            UvmEventMapRemoteCauseEviction));
11438                 if (status != NV_OK)
11439                     break;
11440             }
11441         }
11442 
11443         if (status != NV_OK) {
11444             UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n",
11445                           va_block->start,
11446                           va_block->end,
11447                           nvstatusToString(status),
11448                           uvm_va_space_processor_name(va_space, id));
11449         }
11450     }
11451 
11452 unlock:
11453     uvm_va_space_up_read(va_space);
11454     uvm_va_block_context_free(block_context);
11455 
11456 done:
11457     uvm_va_space_mm_release_unlock(va_space, mm);
11458     uvm_va_block_release(va_block);
11459 }
11460 
11461 static void block_add_eviction_mappings_entry(void *args)
11462 {
11463     UVM_ENTRY_VOID(block_add_eviction_mappings(args));
11464 }
11465 
11466 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
11467                                     uvm_gpu_t *gpu,
11468                                     uvm_gpu_chunk_t *root_chunk,
11469                                     uvm_tracker_t *tracker)
11470 {
11471     NV_STATUS status = NV_OK;
11472     NvU32 i;
11473     uvm_va_block_gpu_state_t *gpu_state;
11474     uvm_va_block_region_t chunk_region;
11475     size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu);
11476     size_t chunks_to_evict = 0;
11477     uvm_va_block_context_t *block_context;
11478     uvm_page_mask_t *pages_to_evict;
11479     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
11480     uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11481     struct mm_struct *mm;
11482     bool accessed_by_set = false;
11483 
11484     uvm_assert_mutex_locked(&va_block->lock);
11485 
11486     // The block might have been killed in the meantime
11487     if (!va_space)
11488         return NV_OK;
11489 
11490     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
11491     if (!gpu_state)
11492         return NV_OK;
11493 
11494     if (va_block_test && va_block_test->inject_eviction_error) {
11495         va_block_test->inject_eviction_error = false;
11496         return NV_ERR_NO_MEMORY;
11497     }
11498 
    // We cannot take this block's VA space lock or mmap_lock on the eviction
    // path. However, we retain the mm in order to support accounting of CPU
    // memory allocations. If mappings need to be created,
    // block_add_eviction_mappings() will be scheduled below.
11503     mm = uvm_va_space_mm_retain(va_space);
11504     block_context = uvm_va_block_context_alloc(mm);
11505     if (!block_context) {
11506         if (mm)
11507             uvm_va_space_mm_release(va_space);
11508         return NV_ERR_NO_MEMORY;
11509     }
11510 
11511     pages_to_evict = &block_context->caller_page_mask;
11512     uvm_page_mask_zero(pages_to_evict);
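    // chunk_region is advanced chunk by chunk inside the loop below; starting
    // with outer == 0 makes the first chunk begin at page 0.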
11513     chunk_region.outer = 0;
11514 
11515     // Find all chunks that are subchunks of the root chunk
11516     for (i = 0; i < num_gpu_chunks; ++i) {
11517         uvm_chunk_size_t chunk_size;
11518         size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size);
11519         UVM_ASSERT(chunk_index == i);
11520         chunk_region.first = chunk_region.outer;
11521         chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE;
11522 
11523         if (!gpu_state->chunks[i])
11524             continue;
11525         if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk))
11526             continue;
11527 
11528         if (uvm_va_block_is_hmm(va_block)) {
11529             status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region);
11530             if (status != NV_OK)
11531                 break;
11532         }
11533 
11534         uvm_page_mask_region_fill(pages_to_evict, chunk_region);
11535         ++chunks_to_evict;
11536     }
11537 
11538     if (chunks_to_evict == 0)
11539         goto out;
11540 
11541     // Only move pages resident on the GPU
11542     uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id));
11543     uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
11544 
11545     if (uvm_va_block_is_hmm(va_block)) {
11546         status = uvm_hmm_va_block_evict_chunks(va_block,
11547                                                block_context,
11548                                                pages_to_evict,
11549                                                uvm_va_block_region_from_block(va_block),
11550                                                &accessed_by_set);
11551     }
11552     else {
11553         const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
11554         accessed_by_set = uvm_processor_mask_get_count(&policy->accessed_by) > 0;
11555 
11556         // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's
11557         // not necessary to do so for eviction. Add a version that unmaps only
11558         // the processors that have mappings to the pages being evicted.
11559         status = uvm_va_block_make_resident(va_block,
11560                                             NULL,
11561                                             block_context,
11562                                             UVM_ID_CPU,
11563                                             uvm_va_block_region_from_block(va_block),
11564                                             pages_to_evict,
11565                                             NULL,
11566                                             UVM_MAKE_RESIDENT_CAUSE_EVICTION);
11567     }
11568     if (status != NV_OK)
11569         goto out;
11570 
    // The VA space lock may not be held, so we cannot reestablish any mappings
    // here; instead, that work is deferred to a work queue.
11573     //
11574     // Reading the accessed_by mask without the VA space lock is safe because
11575     // adding a new processor to the mask triggers going over all the VA blocks
11576     // in the range and locking them. And we hold one of the VA block's locks.
11577     //
11578     // If uvm_va_range_set_accessed_by() hasn't called
11579     // uvm_va_block_set_accessed_by() for this block yet then it will take care
11580     // of adding the mapping after we are done. If it already did then we are
11581     // guaranteed to see the new processor in the accessed_by mask because we
11582     // locked the block's lock that the thread calling
11583     // uvm_va_range_set_accessed_by() unlocked after updating the mask.
11584     //
11585     // If a processor gets removed from the mask then we might not notice and
11586     // schedule the work item anyway, but that's benign as
11587     // block_add_eviction_mappings() re-examines the mask.
11588     //
    // Checking whether access counter migrations are enabled on a VA space is
    // racy without holding the VA space lock. However, this is fine because
    // block_add_eviction_mappings() re-examines the value with the VA space
    // lock held.
11593     if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) {
11594         // Always retain the VA block first so that it's safe for the deferred
11595         // callback to release it immediately after it runs.
11596         uvm_va_block_retain(va_block);
11597 
11598         if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q,
11599                                           &va_block->eviction_mappings_q_item)) {
11600             // And release it if no new callback was scheduled
11601             uvm_va_block_release_no_destroy(va_block);
11602         }
11603     }
11604 
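    // Add this block's pending work to the caller's tracker so that the
    // eviction waits for it to complete.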
11605     status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker);
11606     if (status != NV_OK)
11607         goto out;
11608 
11609     for (i = 0; i < num_gpu_chunks; ++i) {
11610         uvm_gpu_id_t accessing_gpu_id;
11611         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
11612 
11613         if (!chunk)
11614             continue;
11615         if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
11616             continue;
11617 
        // Remove the mappings of indirect peers from the reverse map. We
        // access the indirect peer mask from the VA space without holding the
        // VA space lock, so we can race with enable_peer/disable_peer
        // operations. However, this is fine:
11622         //
11623         // The enable_peer sequence is as follows:
11624         //
11625         // set_bit in va_space->indirect_peers
11626         // uvm_va_block_enable_peer;
11627         //
        // - If we read the mask BEFORE it is set or AFTER the mapping has
        //   been added to the map, there is no race.
        // - If we read the mask AFTER it is set but BEFORE adding the mapping
        //   to the reverse map, we will try to remove a mapping that is not
        //   there yet. Therefore, we use
        //   uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
        //   not check whether the mapping is present in the reverse map.
11635         //
11636         // The disable_peer sequence is as follows:
11637         //
11638         // uvm_va_block_disable_peer;
11639         // clear_bit in va_space->indirect_peers
11640         //
        // - If we read the mask BEFORE the mapping has been added to the map
        //   or AFTER the bit has been cleared, there is no race.
        // - If we read the mask AFTER the mapping has been removed and BEFORE
        //   the bit is cleared, we will try to remove a mapping that is
        //   already gone. Again,
        //   uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works in
        //   this scenario.
        //
        // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
        // peers are not supported when SMC is enabled.
11649         for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
11650             uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
11651             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
11652 
11653             uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
11654                                                                    peer_addr);
11655         }
11656 
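        // Unmap the chunk from UVM's internal mappings (tracked by the
        // caller's tracker) and hand it back to PMM as evicted.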
11657         uvm_mmu_chunk_unmap(chunk, tracker);
11658 
        uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, chunk);
11660         gpu_state->chunks[i] = NULL;
11661     }
11662 
11663 out:
11664     uvm_va_block_context_free(block_context);
11665     if (mm)
11666         uvm_va_space_mm_release(va_space);
11667 
11668     return status;
11669 }
11670 
11671 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
11672 {
11673     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
11674     uvm_push_t push;
11675     NV_STATUS status;
11676 
11677     // See comment in uvm_va_block_set_cancel
11678     UVM_ASSERT(!gpu->parent->fault_cancel_va_supported);
11679 
11680     if (!gpu_state)
11681         return NV_ERR_NO_MEMORY;
11682 
11683     // Force all pages to be 4K and prevent future upgrades during cancel
11684     gpu_state->force_4k_ptes = true;
11685 
    // If we have no page tables, we're done. For fault cancel we need to make
    // sure that fatal faults are on different 4k PTEs than non-fatal faults,
    // and we need to service all non-fatal faults before issuing the cancel.
    // So either all faults are fatal and we have no PTEs (we're PROT_NONE), or
    // we'll allocate PTEs later when we service the non-fatal faults. Those
    // PTEs will be 4k since force_4k_ptes is set.
11692     if (!block_gpu_has_page_tables(block, gpu))
11693         return NV_OK;
11694 
11695     // Are we 4k already?
11696     if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
11697         return NV_OK;
11698 
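    // Allocate the 4k page tables needed for the splits below.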
11699     status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL);
11700     if (status != NV_OK)
11701         return status;
11702 
11703     status = uvm_push_begin_acquire(gpu->channel_manager,
11704                                     UVM_CHANNEL_TYPE_MEMOPS,
11705                                     &block->tracker,
11706                                     &push,
11707                                     "Forcing 4k PTEs on block [0x%llx, 0x%llx)",
11708                                     block->start,
11709                                     block->end + 1);
11710     if (status != NV_OK)
11711         return status;
11712 
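    // Split the existing 2M or big PTEs down to 4k within this push.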
11713     if (gpu_state->pte_is_2m)
11714         block_gpu_split_2m(block, block_context, gpu, NULL, &push);
11715     else
11716         block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push);
11717 
11718     uvm_push_end(&push);
11719 
11720     UVM_ASSERT(block_check_mappings(block));
11721 
11722     return uvm_tracker_add_push_safe(&block->tracker, &push);
11723 }
11724 
11725 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
11726 {
11727     uvm_assert_mutex_locked(&va_block->lock);
11728 
    // Volta+ devices support a global VA cancel method that does not require
    // 4k PTEs. Thus, skip this PTE splitting, particularly because it could
    // result in 4k PTEs on P9 systems, which otherwise would never need them.
11733     if (gpu->parent->fault_cancel_va_supported)
11734         return NV_OK;
11735 
11736     return block_gpu_force_4k_ptes(va_block, block_context, gpu);
11737 }
11738 
11739 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
11740 {
11741     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11742     struct mm_struct *mm;
11743     uvm_va_block_t *va_block;
11744     uvm_va_block_test_t *va_block_test;
11745     NV_STATUS status = NV_OK;
11746 
11747     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11748     uvm_va_space_down_read(va_space);
11749 
11750     if (mm)
11751         status = uvm_va_block_find_create(va_space, params->lookup_address, NULL, &va_block);
11752     else
11753         status = uvm_va_block_find_create_managed(va_space, params->lookup_address, &va_block);
11754 
11755     if (status != NV_OK)
11756         goto out;
11757 
11758     va_block_test = uvm_va_block_get_test(va_block);
11759     UVM_ASSERT(va_block_test);
11760 
11761     uvm_mutex_lock(&va_block->lock);
11762 
11763     if (params->page_table_allocation_retry_force_count)
11764         va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;
11765 
11766     if (params->user_pages_allocation_retry_force_count)
11767         va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;
11768 
11769     if (params->cpu_chunk_allocation_size_mask) {
11770         if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
11771             !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
11772             status = NV_ERR_INVALID_ARGUMENT;
11773             goto block_unlock;
11774         }
11775 
11776         va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
11777     }
11778 
11779     if (params->eviction_error)
11780         va_block_test->inject_eviction_error = params->eviction_error;
11781 
11782     if (params->cpu_pages_allocation_error_count)
11783         va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
11784 
11785     if (params->populate_error)
11786         va_block_test->inject_populate_error = params->populate_error;
11787 
11788 block_unlock:
11789     uvm_mutex_unlock(&va_block->lock);
11790 
11791 out:
11792     uvm_va_space_up_read(va_space);
11793     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11794     return status;
11795 }
11796 
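// Translation tables between the UVM_TEST_PTE_MAPPING values used by the test
// ioctls and the driver's internal protection values. The two tables are
// inverses of each other.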
11797 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
11798 {
11799     [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
11800     [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
11801     [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
11802     [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
11803 };
11804 
11805 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
11806 {
11807     [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
11808     [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
11809     [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
11810     [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
11811 };
11812 
11813 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
11814 {
11815     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11816     uvm_va_block_t *block;
11817     struct mm_struct *mm;
11818     NV_STATUS status = NV_OK;
11819     uvm_prot_t curr_prot, new_prot;
11820     uvm_gpu_t *gpu = NULL;
11821     uvm_processor_id_t id;
11822     uvm_tracker_t local_tracker;
11823     uvm_va_block_region_t region;
11824     uvm_va_block_context_t *block_context = NULL;
11825 
11826     if (!PAGE_ALIGNED(params->va))
11827         return NV_ERR_INVALID_ADDRESS;
11828 
11829     if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
11830         return NV_ERR_INVALID_ARGUMENT;
11831 
11832     new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
11833 
11834     // mmap_lock isn't needed for invalidating CPU mappings, but it will be
11835     // needed for inserting them.
11836     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11837     uvm_va_space_down_read(va_space);
11838 
11839     if (uvm_uuid_is_cpu(&params->uuid)) {
11840         id = UVM_ID_CPU;
11841     }
11842     else {
11843         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
11844         if (!gpu) {
11845             status = NV_ERR_INVALID_DEVICE;
11846             goto out;
11847         }
11848 
11849         // Check if the GPU can access the VA
11850         if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
11851             status = NV_ERR_OUT_OF_RANGE;
11852             goto out;
11853         }
11854 
11855         id = gpu->id;
11856     }
11857 
11858     block_context = uvm_va_block_context_alloc(mm);
11859     if (!block_context) {
11860         status = NV_ERR_NO_MEMORY;
11861         goto out;
11862     }
11863 
11864     if (mm)
11865         status = uvm_va_block_find_create(va_space, params->va, &block_context->hmm.vma, &block);
11866     else
11867         status = uvm_va_block_find_create_managed(va_space, params->va, &block);
11868 
11869     if (status != NV_OK)
11870         goto out;
11871 
11872     // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
11873     if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
11874         goto out;
11875 
11876     uvm_mutex_lock(&block->lock);
11877 
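    // Operate on the single page containing params->va.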
11878     region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
11879     curr_prot = block_page_prot(block, id, region.first);
11880 
11881     if (new_prot == curr_prot) {
11882         status = NV_OK;
11883         goto out_block;
11884     }
11885 
11886     // TODO: Bug 1766124: Upgrades might require revoking other processors'
11887     //       access privileges. We just fail for now. Only downgrades are
11888     //       supported. If we allowed upgrades, we would need to check the mm
11889     //       like we do for revocation below.
11890     if (new_prot > curr_prot) {
11891         status = NV_ERR_INVALID_OPERATION;
11892         goto out_block;
11893     }
11894 
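    // Downgrading to UVM_PROT_NONE is a plain unmap. Other downgrades revoke
    // every protection above the requested one (new_prot + 1 and up).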
11895     if (new_prot == UVM_PROT_NONE) {
11896         status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
11897     }
11898     else {
11899         UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
11900 
11901         // Revoking CPU mappings performs a combination of unmap + map. The map
11902         // portion requires a valid mm.
11903         if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
11904             status = NV_ERR_INVALID_STATE;
11905         }
11906         else {
11907             status = uvm_va_block_revoke_prot(block,
11908                                               block_context,
11909                                               id,
11910                                               region,
11911                                               NULL,
11912                                               new_prot + 1,
11913                                               &block->tracker);
11914         }
11915     }
11916 
11917 out_block:
11918     if (status == NV_OK)
11919         status = uvm_tracker_init_from(&local_tracker, &block->tracker);
11920 
11921     uvm_mutex_unlock(&block->lock);
11922 
11923     if (status == NV_OK)
11924         status = uvm_tracker_wait_deinit(&local_tracker);
11925 
11926 out:
11927     uvm_va_space_up_read(va_space);
11928     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11929 
11930     uvm_va_block_context_free(block_context);
11931 
11932     return status;
11933 }
11934 
11935 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
11936 {
11937     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11938     uvm_va_block_t *va_block;
11939     uvm_va_range_t *va_range;
11940     struct mm_struct *mm;
11941     size_t index;
11942     NV_STATUS status = NV_OK;
11943 
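    // The test ioctl ABI and the driver must agree on the VA block size.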
11944     BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);
11945 
11946     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11947     uvm_va_space_down_read(va_space);
11948 
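    // Managed allocations are found through their VA range. An address with no
    // VA range may still be covered by an HMM block.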
11949     va_range = uvm_va_range_find(va_space, params->lookup_address);
11950     if (!va_range) {
11951         status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
11952         if (status == NV_ERR_OBJECT_NOT_FOUND) {
11953             status = uvm_hmm_va_block_range_bounds(va_space,
11954                                                    mm,
11955                                                    params->lookup_address,
11956                                                    &params->va_block_start,
11957                                                    &params->va_block_end,
11958                                                    NULL);
11959             goto out;
11960         }
11961         else if (status != NV_OK) {
11962             goto out;
11963         }
11964     }
11965     else {
11966         index = uvm_va_range_block_index(va_range, params->lookup_address);
11967         va_block = uvm_va_range_block(va_range, index);
11968         if (!va_block) {
11969             status = NV_ERR_OBJECT_NOT_FOUND;
11970             goto out;
11971         }
11972     }
11973 
11974     params->va_block_start = va_block->start;
11975     params->va_block_end   = va_block->end;
11976 
11977 out:
11978     uvm_va_space_up_read(va_space);
11979     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11980     return status;
11981 }
11982 
11983 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
11984 {
11985     NV_STATUS status = NV_OK;
11986     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11987     uvm_va_range_t *va_range;
11988     uvm_va_block_t *block = NULL;
11989     struct mm_struct *mm;
11990     NvU32 count = 0;
11991     uvm_processor_mask_t resident_on_mask;
11992     uvm_processor_id_t id;
11993     uvm_page_index_t page_index;
11994     unsigned release_block_count = 0;
11995     NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
11996     size_t index;
11997 
11998     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11999     uvm_va_space_down_read(va_space);
12000 
12001     // Inline uvm_va_block_find() to get the va_range.
12002     va_range = uvm_va_range_find(va_space, addr);
12003     if (!va_range) {
12004         NvU64 start, end;
12005 
12006         status = uvm_hmm_va_block_find(va_space, addr, &block);
12007         if (status != NV_OK) {
12008             if (status != NV_ERR_OBJECT_NOT_FOUND)
12009                 goto out;
12010             status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
12011             goto out;
12012         }
12013         // Update current CPU mapping information.
12014         status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
12015         if (status != NV_OK) {
12016             block = NULL;
12017             goto out;
12018         }
12019     }
12020     else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
12021         status = NV_ERR_INVALID_ADDRESS;
12022         goto out;
12023     }
12024     else {
12025         index = uvm_va_range_block_index(va_range, addr);
12026         block = uvm_va_range_block(va_range, index);
12027         if (!block) {
12028             params->resident_on_count = 0;
12029             params->populated_on_count = 0;
12030             params->mapped_on_count = 0;
12031 
12032             status = NV_OK;
12033 
12034             goto out;
12035         }
12036     }
12037 
12038     uvm_mutex_lock(&block->lock);
12039 
12040     page_index = uvm_va_block_cpu_page_index(block, addr);
12041     uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);
12042 
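    // Report the physical address and size of the page on each processor where
    // it is resident.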
12043     for_each_id_in_mask(id, &resident_on_mask) {
12044         block_phys_page_t block_page = block_phys_page(id, page_index);
12045         uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
12046         params->resident_physical_size[count] = block_phys_page_size(block, block_page);
12047         if (UVM_ID_IS_CPU(id)) {
12048             params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
12049         }
12050         else {
12051             params->resident_physical_address[count] =
12052                 block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
12053         }
12054         ++count;
12055     }
12056     params->resident_on_count = count;
12057 
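    // Report mapping information for each processor that has the page mapped.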
12058     count = 0;
12059     for_each_id_in_mask(id, &block->mapped) {
12060         uvm_processor_id_t processor_to_map;
12061         block_phys_page_t block_page;
12062         NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12063 
12064         if (page_size == 0)
12065             continue;
12066 
12067         uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);
12068 
12069         params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
12070         UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
12071         processor_to_map = block_get_processor_to_map(block, id, page_index);
12072         block_page = block_phys_page(processor_to_map, page_index);
12073 
12074         if (!UVM_ID_IS_CPU(id)) {
12075             uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
12076                                                                            block_page,
12077                                                                            uvm_va_space_get_gpu(va_space, id));
12078             params->mapping_physical_address[count] = gpu_phys_addr.address;
12079         }
12080         else {
12081             struct page *page = block_page_get(block, block_page);
12082 
12083             params->mapping_physical_address[count] = page_to_phys(page);
12084         }
12085 
12086         params->page_size[count] = page_size;
12087         ++count;
12088     }
12089 
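    // When the page is resident in exactly one place, sanity-check the reverse
    // (physical-to-virtual) translation. Each successful translation retains
    // the VA block; the references are dropped at 'out:'.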
12090     if (params->resident_on_count == 1) {
12091         if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
12092             if (uvm_pmm_sysmem_mappings_indirect_supported()) {
12093                 for_each_gpu_id(id) {
12094                     NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12095                     uvm_reverse_map_t sysmem_page;
12096                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
12097                     size_t num_pages;
12098                     uvm_gpu_t *gpu;
12099 
12100                     if (!uvm_va_block_gpu_state_get(block, id))
12101                         continue;
12102 
12103                     gpu = uvm_va_space_get_gpu(va_space, id);
12104 
12105                     if (!gpu->parent->access_counters_supported)
12106                         continue;
12107 
12108                     num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
12109                                                                     uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
12110                                                                     uvm_cpu_chunk_get_size(chunk),
12111                                                                     &sysmem_page,
12112                                                                     1);
12113                     if (page_size > 0)
12114                         UVM_ASSERT(num_pages == 1);
12115                     else
12116                         UVM_ASSERT(num_pages <= 1);
12117 
12118                     if (num_pages == 1) {
12119                         UVM_ASSERT(sysmem_page.va_block == block);
12120                         UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
12121                         UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
12122 
12123                         ++release_block_count;
12124                     }
12125                 }
12126             }
12127         }
12128         else {
12129             uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
12130             uvm_reverse_map_t gpu_mapping;
12131             size_t num_pages;
12132             uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12133             uvm_gpu_phys_address_t phys_addr;
12134 
12135             phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
12136             num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
12137 
            // The chunk may be in the TEMP_PINNED state, so it may not have a
            // VA block assigned. In that case, we don't get a valid
            // translation.
12140             if (num_pages > 0) {
12141                 UVM_ASSERT(num_pages == 1);
12142                 UVM_ASSERT(gpu_mapping.va_block == block);
12143                 UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
12144 
12145                 ++release_block_count;
12146             }
12147         }
12148     }
12149 
12150     params->mapped_on_count = count;
12151 
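    // Report every processor that has backing storage populated for the page.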
12152     count = 0;
12153     for_each_processor_id(id) {
12154         if (!block_processor_page_is_populated(block, id, page_index))
12155             continue;
12156 
12157         uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
12158         ++count;
12159     }
12160     params->populated_on_count = count;
12161 
12162 out:
12163     if (block) {
12164         if (!params->is_async && status == NV_OK)
12165             status = uvm_tracker_wait(&block->tracker);
12166         uvm_mutex_unlock(&block->lock);
12167         while (release_block_count--)
12168             uvm_va_block_release(block);
12169     }
12170     uvm_va_space_up_read(va_space);
12171     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
12172     return status;
12173 }
12174 
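// Whole-block convenience wrapper around block_mark_region_cpu_dirty().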
12175 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
12176 {
12177     block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
12178 }
12179