1 /*******************************************************************************
2     Copyright (c) 2015-2023 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_linux.h"
25 #include "uvm_common.h"
26 #include "uvm_api.h"
27 #include "uvm_gpu.h"
28 #include "uvm_va_space.h"
29 #include "uvm_va_range.h"
30 #include "uvm_va_block.h"
31 #include "uvm_hal_types.h"
32 #include "uvm_kvmalloc.h"
33 #include "uvm_tools.h"
34 #include "uvm_push.h"
35 #include "uvm_hal.h"
36 #include "uvm_perf_thrashing.h"
37 #include "uvm_perf_prefetch.h"
38 #include "uvm_mem.h"
39 #include "uvm_gpu_access_counters.h"
40 #include "uvm_va_space_mm.h"
41 #include "uvm_test_ioctl.h"
42 #include "uvm_conf_computing.h"
43 
44 typedef enum
45 {
46     BLOCK_PTE_OP_MAP,
47     BLOCK_PTE_OP_REVOKE,
48     BLOCK_PTE_OP_COUNT
49 } block_pte_op_t;
50 
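// Size, in nanoseconds, of the tracking window applied to authorized CPU
// faults (300000ns == 300us).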
51 static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;
52 
53 static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
54 static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
55 static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
56 static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
57 
58 static int uvm_fault_force_sysmem __read_mostly = 0;
59 module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
60 MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");
61 
62 static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
63 module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);
64 
65 // Caching is always disabled for mappings to remote memory. The following two
66 // module parameters can be used to force caching for GPU peer/sysmem mappings.
67 //
// However, it may not be safe to enable caching in the general case, so these
// parameters should only be used for experiments.
70 static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
71 module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
72 MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
73                  "Force caching for mappings to peer memory. "
74                  "This is an experimental parameter that may cause correctness issues if used.");
75 
76 static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
77 module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
78 MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
79                  "Force caching for mappings to system memory. "
80                  "This is an experimental parameter that may cause correctness issues if used.");
81 
82 static void block_add_eviction_mappings_entry(void *args);
83 
84 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block)
85 {
86 #if UVM_IS_CONFIG_HMM()
87     if (va_block->hmm.va_space)
88         return va_block->hmm.va_space;
89 #endif
90 
91     if (va_block->va_range)
92         return va_block->va_range->va_space;
93 
94     return NULL;
95 }
96 
97 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
98 {
99     uvm_va_space_t *va_space;
100 
101     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
102 
103     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
104     UVM_ASSERT(va_space);
105 
106     return va_space;
107 }
108 
109 bool uvm_va_block_check_policy_is_valid(uvm_va_block_t *va_block,
110                                         const uvm_va_policy_t *policy,
111                                         uvm_va_block_region_t region)
112 {
113     uvm_assert_mutex_locked(&va_block->lock);
114 
115     if (uvm_va_block_is_hmm(va_block)) {
116         const uvm_va_policy_node_t *node;
117 
118         if (uvm_va_policy_is_default(policy)) {
119             // There should only be the default policy within the region.
120             node = uvm_va_policy_node_iter_first(va_block,
121                                                  uvm_va_block_region_start(va_block, region),
122                                                  uvm_va_block_region_end(va_block, region));
123             UVM_ASSERT(!node);
124         }
125         else {
126             // The policy node should cover the region.
127             node = uvm_va_policy_node_from_policy(policy);
128             UVM_ASSERT(node->node.start <= uvm_va_block_region_start(va_block, region));
129             UVM_ASSERT(node->node.end >= uvm_va_block_region_end(va_block, region));
130         }
131     }
132     else {
133         UVM_ASSERT(policy == uvm_va_range_get_policy(va_block->va_range));
134     }
135 
136     return true;
137 }
138 
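// Return the PTE cacheability flag gpu should use for memory resident on
// resident_id: local vidmem is always cached, while sysmem and peer memory are
// mapped uncached unless forced by the experimental module parameters above.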
139 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
140 {
141     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
142 
143     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
144 
145     // Local vidmem is always cached
146     if (uvm_id_equal(resident_id, gpu->id))
147         return UVM_MMU_PTE_FLAGS_CACHED;
148 
149     if (UVM_ID_IS_CPU(resident_id))
150         return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
151 
152     UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));
153 
154     return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
155 }
156 
157 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
158 {
159     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
160 
161     return uvm_va_space_get_gpu(va_space, gpu_id);
162 }
163 
164 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id)
165 {
166     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
167 
168     return uvm_va_space_processor_name(va_space, id);
169 }
170 
171 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id)
172 {
173     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
174 
175     return uvm_va_space_processor_has_memory(va_space, id);
176 }
177 
178 static bool is_uvm_fault_force_sysmem_set(void)
179 {
180     // Only enforce this during testing
181     return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0;
182 }
183 
184 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space)
185 {
186     return uvm_perf_map_remote_on_eviction &&
187            uvm_va_space_has_access_counter_migrations(va_space);
188 }
189 
190 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block)
191 {
192     // Note that for HMM we always return a pointer to a zero bitmap
193     // (not allocated on the stack) since uvm_lite GPUs are not supported.
194     static const uvm_processor_mask_t uvm_lite_gpus = {};
195 
196     if (uvm_va_block_is_hmm(va_block))
197         return &uvm_lite_gpus;
198     else
199         return &va_block->va_range->uvm_lite_gpus;
200 }
201 
202 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
203 {
204     if (!retry)
205         return;
206 
207     uvm_tracker_init(&retry->tracker);
208     INIT_LIST_HEAD(&retry->used_chunks);
209     INIT_LIST_HEAD(&retry->free_chunks);
210 }
211 
// The bottom bit of uvm_va_block_t::cpu.chunks is used to indicate how CPU
// chunks are stored.
214 //
215 // CPU chunk storage is handled in three different ways depending on the
216 // type of chunks the VA block owns. This is done to minimize the memory
217 // required to hold metadata.
218 typedef enum
219 {
    // The uvm_va_block_t::cpu.chunks pointer points to a single 2MB
    // CPU chunk.
222     UVM_CPU_CHUNK_STORAGE_CHUNK = 0,
223 
    // The uvm_va_block_t::cpu.chunks pointer points to a
    // uvm_cpu_chunk_storage_mixed_t structure of mixed (64K and 4K) chunks.
226     UVM_CPU_CHUNK_STORAGE_MIXED,
227     UVM_CPU_CHUNK_STORAGE_COUNT,
228 } uvm_cpu_chunk_storage_type_t;
229 
230 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1
231 
232 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is
233 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will
234 // be stored in the first/last entry, respectively.
235 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK
236 
237 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE)
238 
239 // This structure is used when a VA block contains 64K or a mix of 64K and 4K
240 // CPU chunks.
// For every 64K CPU chunk, big_chunks will have its corresponding bit set and
// the corresponding element in slots will point directly to the
// uvm_cpu_chunk_t structure.
244 //
245 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and
246 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers.
247 typedef struct {
248     DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
249     void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK];
250 } uvm_cpu_chunk_storage_mixed_t;
251 
252 static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
253                                                         uvm_cpu_chunk_t *chunk,
254                                                         uvm_page_index_t page_index)
255 {
256     UVM_ASSERT(chunk);
257     return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
258 }
259 
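// Return the CPU chunk storage pointer with the storage-type bit masked off.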
260 static void *uvm_cpu_storage_get_ptr(uvm_va_block_t *block)
261 {
262     return (void *)(block->cpu.chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
263 }
264 
265 static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_t *block)
266 {
267     return block->cpu.chunks & UVM_CPU_CHUNK_STORAGE_MASK;
268 }
269 
270 static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
271 {
272     return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
273 }
274 
275 static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
276 {
277     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
278     uvm_page_index_t prefix;
279     size_t slot_index;
280 
281     UVM_ASSERT(page_index < block_region.outer);
282     prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
283 
284     if (page_index < prefix)
285         return 0;
286 
287     slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
288     UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
289 
290     return slot_index;
291 }
292 
293 static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
294 {
295     size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
296 
297     if (page_index < prefix)
298         return page_index;
299 
300     return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
301 }
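
// Illustrative example (not from the original source): with 4K system pages
// and a block whose start is 8K past a 64K boundary, compute_page_prefix() for
// UVM_PAGE_SIZE_64K returns 14. Pages 0..13 then land in slot 0, pages 14..29
// in slot 1, pages 30..45 in slot 2, and so on. Within a slot,
// compute_small_index() returns the page_index itself for the prefix pages and
// (page_index - 14) % 16 afterwards.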
302 
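// Insert a CPU chunk covering page_index into the block's CPU chunk storage,
// allocating the mixed storage structures on demand, and mark the covered
// pages as allocated in block->cpu.allocated.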
303 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block,
304                                         uvm_cpu_chunk_t *chunk,
305                                         uvm_page_index_t page_index)
306 {
307     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
308     uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
309     size_t slot_index;
310     uvm_cpu_chunk_storage_mixed_t *mixed;
311     uvm_cpu_chunk_t **chunks = NULL;
312 
313     // We only want to use the bottom bit of a pointer.
314     BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);
315 
316     // We want to protect against two threads manipulating the VA block's CPU
317     // chunks at the same time. However, when a block is split, the new block's
318     // lock is locked without tracking. So, we can't use
319     // uvm_assert_mutex_locked().
320     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
321 
322     if (chunk_size == UVM_CHUNK_SIZE_2M) {
323         UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
324         UVM_ASSERT(!va_block->cpu.chunks);
325         va_block->cpu.chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
326     }
327     else {
328         if (!va_block->cpu.chunks) {
329             mixed = uvm_kvmalloc_zero(sizeof(*mixed));
330             if (!mixed)
331                 return NV_ERR_NO_MEMORY;
332 
333             va_block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
334         }
335 
336         UVM_ASSERT(uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_MIXED);
337         mixed = uvm_cpu_storage_get_ptr(va_block);
338         slot_index = compute_slot_index(va_block, page_index);
339         UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
340         UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
341 
342         if (chunk_size == UVM_CHUNK_SIZE_64K) {
343             mixed->slots[slot_index] = chunk;
344             set_bit(slot_index, mixed->big_chunks);
345         }
346         else {
347             size_t small_index;
348 
349             UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
350             chunks = mixed->slots[slot_index];
351 
352             if (!chunks) {
353                 chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
354                 if (!chunks)
355                     return NV_ERR_NO_MEMORY;
356                 mixed->slots[slot_index] = chunks;
357             }
358 
359             small_index = compute_small_index(va_block, page_index);
360             chunks[small_index] = chunk;
361         }
362     }
363 
364     uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
365     return NV_OK;
366 }
367 
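// Return the CPU chunk backing page_index, or NULL if no CPU chunk is
// allocated for that page.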
368 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
369 {
370     uvm_cpu_chunk_storage_mixed_t *mixed;
371     uvm_cpu_chunk_t *chunk;
372     uvm_cpu_chunk_t **chunks;
373     size_t slot_index;
374 
375     UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
376     if (!uvm_page_mask_test(&va_block->cpu.allocated, page_index))
377         return NULL;
378 
379     UVM_ASSERT(va_block->cpu.chunks);
380 
381     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
382         return uvm_cpu_storage_get_ptr(va_block);
383     }
384     else {
385         mixed = uvm_cpu_storage_get_ptr(va_block);
386         slot_index = compute_slot_index(va_block, page_index);
387         UVM_ASSERT(mixed->slots[slot_index] != NULL);
388         if (test_bit(slot_index, mixed->big_chunks))
389             return mixed->slots[slot_index];
390 
391         chunks = mixed->slots[slot_index];
392         chunk = chunks[compute_small_index(va_block, page_index)];
393     }
394 
395     UVM_ASSERT(chunk);
396     return chunk;
397 }
398 
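// Remove the CPU chunk backing page_index from the block's storage and clear
// the corresponding bits in block->cpu.allocated. The chunk itself is not
// freed; only storage metadata that becomes empty is.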
399 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block,
400                                      uvm_page_index_t page_index)
401 {
402     uvm_cpu_chunk_storage_mixed_t *mixed;
403     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
404     uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
405     size_t slot_index;
406     uvm_cpu_chunk_t **chunks;
407 
408     // We want to protect against two threads manipulating the VA block's CPU
409     // chunks at the same time. However, when a block is split, the new block's
410     // lock is locked without tracking. So, we can't use
411     // uvm_assert_mutex_locked().
412     UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
413     UVM_ASSERT(va_block->cpu.chunks);
414     UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));
415 
416     if (uvm_cpu_storage_get_type(va_block) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
417         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
418         UVM_ASSERT(uvm_cpu_storage_get_ptr(va_block) == chunk);
419         va_block->cpu.chunks = 0;
420     }
421     else {
422         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
423         mixed = uvm_cpu_storage_get_ptr(va_block);
424         slot_index = compute_slot_index(va_block, page_index);
425         UVM_ASSERT(mixed->slots[slot_index] != NULL);
426 
427         if (test_bit(slot_index, mixed->big_chunks)) {
428             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
429             UVM_ASSERT(mixed->slots[slot_index] == chunk);
430             mixed->slots[slot_index] = NULL;
431             clear_bit(slot_index, mixed->big_chunks);
432         }
433         else {
434             size_t small_index;
435 
436             UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
437             chunks = mixed->slots[slot_index];
438             small_index = compute_small_index(va_block, page_index);
439             UVM_ASSERT(chunks[small_index] == chunk);
440             chunks[small_index] = NULL;
441 
442             for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
443                 if (chunks[small_index])
444                     break;
445             }
446 
447             if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
448                 uvm_kvfree(chunks);
449                 mixed->slots[slot_index] = NULL;
450             }
451         }
452     }
453 
454     uvm_page_mask_region_clear(&va_block->cpu.allocated, chunk_region);
455 
456     if (uvm_page_mask_empty(&va_block->cpu.allocated) && va_block->cpu.chunks) {
457         uvm_kvfree(uvm_cpu_storage_get_ptr(va_block));
458         va_block->cpu.chunks = 0;
459     }
460 }
461 
462 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
463 {
464     uvm_va_block_region_t chunk_region;
465     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
466 
467     UVM_ASSERT(chunk);
468     UVM_ASSERT(chunk->page);
469     chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
470     return chunk->page + (page_index - chunk_region.first);
471 }
472 
473 static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
474                                                       uvm_va_block_region_t region,
475                                                       uvm_page_index_t *first_chunk_page)
476 {
477     uvm_cpu_chunk_t *chunk = NULL;
478     uvm_page_index_t page_index;
479 
480     page_index = uvm_va_block_first_page_in_mask(region, &va_block->cpu.allocated);
481     if (page_index < region.outer)
482         chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_index);
483 
484     if (first_chunk_page && chunk) {
485         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
486         *first_chunk_page = chunk_region.first;
487     }
488 
489     return chunk;
490 }
491 
492 #define for_each_cpu_chunk_in_block_region(chunk, page_index, va_block, region)                                       \
493     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index));                                \
494          (chunk) != NULL;                                                                                             \
495          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                                          \
496                                                  uvm_va_block_region((page_index) + uvm_cpu_chunk_num_pages((chunk)), \
497                                                                      (region).outer),                                 \
498                                                  &(page_index)))
499 
500 #define for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region)    \
501     for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), &(page_index)),                   \
502                        (next_page_index) = (page_index) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);  \
503          (chunk) != NULL;                                                                                \
504          (chunk) = uvm_cpu_chunk_first_in_region((va_block),                                             \
505                                                  uvm_va_block_region((next_page_index), (region).outer), \
506                                                  &(page_index)),                                         \
507              (next_page_index) = (page_index) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
508 
509 #define for_each_cpu_chunk_in_block(chunk, page_index, va_block)        \
510     for_each_cpu_chunk_in_block_region((chunk), (page_index), (va_block), uvm_va_block_region_from_block((va_block)))
511 
512 #define for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, va_block)  \
513     for_each_cpu_chunk_in_block_region_safe((chunk),                                    \
514                                             (page_index),                               \
515                                             (next_page_index),                          \
516                                             (va_block),                                 \
517                                             uvm_va_block_region_from_block((va_block)))
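
// Example usage of the iterators above (illustrative sketch, not from the
// original source): count the CPU pages currently backed by chunks.
//
//     uvm_cpu_chunk_t *chunk;
//     uvm_page_index_t page_index;
//     size_t num_pages = 0;
//
//     for_each_cpu_chunk_in_block(chunk, page_index, va_block)
//         num_pages += uvm_cpu_chunk_num_pages(chunk);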
518 
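// Find the first vma in mm that intersects [start, va_block->end]. On success,
// *region is set to the portion of the block covered by that vma (clamped to
// both the vma and the block) and the vma is returned. Returns NULL if there
// is no intersection.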
519 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
520                                                     struct mm_struct *mm,
521                                                     NvU64 start,
522                                                     uvm_va_block_region_t *region)
523 {
524     struct vm_area_struct *vma;
525     NvU64 end;
526 
527     if (start > va_block->end)
528         return NULL;
529 
530     vma = find_vma_intersection(mm, start, va_block->end + 1);
531     if (!vma)
532         return NULL;
533 
534     if (start < vma->vm_start)
535         start = vma->vm_start;
536 
537     end = vma->vm_end - 1;
538     if (end > va_block->end)
539         end = va_block->end;
540 
541     *region = uvm_va_block_region_from_start_end(va_block, start, end);
542 
543     return vma;
544 }
545 
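// Sanity-check that the CPU chunk storage is consistent with
// block->cpu.allocated: chunks don't overlap, are naturally aligned, and every
// page they cover maps back to the owning chunk. Always returns true so it can
// be used inside UVM_ASSERT().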
546 static bool block_check_cpu_chunks(uvm_va_block_t *block)
547 {
548     uvm_cpu_chunk_t *chunk;
549     size_t alloced_pages = 0;
550     uvm_va_block_region_t prev_region = { 0 };
551     uvm_page_index_t page_index;
552 
553     for_each_cpu_chunk_in_block(chunk, page_index, block) {
554         uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
555         size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
556         uvm_page_index_t chunk_page;
557 
558         UVM_ASSERT(prev_region.outer <= chunk_region.first);
559         UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
560         UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
561 
562         alloced_pages += uvm_cpu_chunk_num_pages(chunk);
563         UVM_ASSERT(uvm_page_mask_region_full(&block->cpu.allocated, chunk_region));
564         prev_region = chunk_region;
565 
566         for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
567             UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, chunk_page) == chunk);
568     }
569 
570     UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&block->cpu.allocated));
571 
572     return true;
573 }
574 
575 // Frees any left-over free chunks and unpins all the used chunks
576 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
577 {
578     uvm_gpu_t *gpu;
579     uvm_gpu_chunk_t *gpu_chunk;
580     uvm_gpu_chunk_t *next_chunk;
581 
582     if (!retry)
583         return;
584 
585     uvm_tracker_deinit(&retry->tracker);
586 
587     // Free any unused chunks
588     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
589         list_del_init(&gpu_chunk->list);
590         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
591         uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
592     }
593 
594     // Unpin all the used chunks now that we are done
595     list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
596         list_del_init(&gpu_chunk->list);
597         gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        // HMM should have already moved allocated chunks to the referenced
        // state, so any chunks left over here were not migrated and should be
        // freed.
600         if (uvm_va_block_is_hmm(va_block))
601             uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
602         else
603             uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
604     }
605 }
606 
607 static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
608 {
609     list_add_tail(&gpu_chunk->list, &retry->free_chunks);
610 }
611 
612 static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
613 {
614     list_add_tail(&gpu_chunk->list, &retry->used_chunks);
615 }
616 
617 static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
618 {
619     uvm_gpu_chunk_t *gpu_chunk;
620 
621     list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
622         if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
623             list_del_init(&gpu_chunk->list);
624             return gpu_chunk;
625         }
626     }
627 
628     return NULL;
629 }
630 
631 // Encapsulates a reference to a physical page belonging to a specific processor
632 // within a VA block.
633 typedef struct
634 {
635     // Processor the page is on
636     uvm_processor_id_t processor;
637 
638     // The page index
639     uvm_page_index_t page_index;
640 } block_phys_page_t;
641 
642 static block_phys_page_t block_phys_page(uvm_processor_id_t processor, uvm_page_index_t page_index)
643 {
644     return (block_phys_page_t){ processor, page_index };
645 }
646 
647 NV_STATUS uvm_va_block_init(void)
648 {
649     if (uvm_enable_builtin_tests)
650         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
651     else
652         g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);
653 
654     if (!g_uvm_va_block_cache)
655         return NV_ERR_NO_MEMORY;
656 
657     g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
658     if (!g_uvm_va_block_gpu_state_cache)
659         return NV_ERR_NO_MEMORY;
660 
661     g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
662     if (!g_uvm_page_mask_cache)
663         return NV_ERR_NO_MEMORY;
664 
665     g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
666     if (!g_uvm_va_block_context_cache)
667         return NV_ERR_NO_MEMORY;
668 
669     return NV_OK;
670 }
671 
672 void uvm_va_block_exit(void)
673 {
674     kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
675     kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
676     kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
677     kmem_cache_destroy_safe(&g_uvm_va_block_cache);
678 }
679 
680 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
681 {
682     uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
683     if (block_context)
684         uvm_va_block_context_init(block_context, mm);
685 
686     return block_context;
687 }
688 
689 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
690 {
691     if (va_block_context)
692         kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
693 }
694 
695 // Convert from page_index to chunk_index. The goal is for each system page in
696 // the region [start, start + size) to be covered by the largest naturally-
697 // aligned user chunk size.
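//
// Illustrative example (not from the original source): with user chunk sizes
// {4K, 64K}, start == 96K, size == 256K and page_index == 16 (VA 160K), the
// pages in [96K, 128K) are covered by eight 4K chunks (indices 0..7) and VA
// 160K falls in the 64K chunk spanning [128K, 192K), so the function returns
// chunk index 8 with *out_chunk_size == UVM_CHUNK_SIZE_64K.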
698 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
699                                           NvU64 size,
700                                           uvm_gpu_t *gpu,
701                                           uvm_page_index_t page_index,
702                                           uvm_chunk_size_t *out_chunk_size)
703 {
704     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
705     uvm_chunk_size_t chunk_size, final_chunk_size;
706     size_t num_chunks, num_chunks_total;
707     NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;
708 
709     UVM_ASSERT(PAGE_ALIGNED(start));
710     UVM_ASSERT(PAGE_ALIGNED(size));
711     UVM_ASSERT(size > 0);
712     UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
713     UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
714     BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);
715 
716     // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
717     // mask, since we never deal with chunk sizes smaller than that (although we
718     // may have PTEs mapping pages smaller than that).
719     UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);
720 
721     // Optimize the ideal Pascal+ case: the whole block is covered by a single
722     // 2M page.
723     if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
724         UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
725         final_chunk_size = UVM_CHUNK_SIZE_2M;
726         num_chunks_total = 0;
727         goto out;
728     }
729 
730     // Only one 2M chunk can fit within a VA block on any GPU architecture, so
731     // remove that size from consideration.
732     chunk_sizes &= ~UVM_CHUNK_SIZE_2M;
733 
734     // Next common case: the whole block is aligned and sized to perfectly fit
735     // the largest page size.
736     final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
737     if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
738         num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
739         goto out;
740     }
741 
742     // We didn't hit our special paths. Do it the hard way.
743 
744     num_chunks_total = 0;
745     addr = start + page_index * PAGE_SIZE;
746     end = start + size;
747     final_chunk_size = 0;
748     UVM_ASSERT(addr < end);
749 
750     // The below loop collapses almost completely when chunk_size == PAGE_SIZE
751     // since in that lowest-common-denominator case everything is already
752     // aligned. Skip it and handle that specially after the loop.
753     //
754     // Note that since we removed 2M already above, this loop will only iterate
755     // once on x86 Pascal+ since only 64K is left.
756     chunk_sizes &= ~PAGE_SIZE;
757 
758     // This loop calculates the number of chunks between start and addr by
759     // calculating the number of whole chunks of each size between them,
760     // starting with the largest allowed chunk size. This requires fewer
761     // iterations than if we began from start and kept calculating the next
762     // larger chunk size boundary.
763     for_each_chunk_size_rev(chunk_size, chunk_sizes) {
764         aligned_start = UVM_ALIGN_UP(start, chunk_size);
765         aligned_addr  = UVM_ALIGN_DOWN(addr, chunk_size);
766         aligned_end   = UVM_ALIGN_DOWN(end, chunk_size);
767 
768         // If addr and start are within the same chunk, try smaller
769         if (aligned_start > aligned_addr)
770             continue;
771 
772         // If addr and end are not in the same chunk, then addr is covered by a
773         // single chunk of the current size. Ignore smaller boundaries between
774         // addr and aligned_addr.
775         if (aligned_addr < aligned_end && final_chunk_size == 0) {
776             addr = aligned_addr;
777             final_chunk_size = chunk_size;
778         }
779 
780         // How many chunks of this size are between start and addr? Note that
781         // this might be 0 since aligned_addr and aligned_start could be in the
782         // same chunk.
        num_chunks = uvm_div_pow2_32((NvU32)(aligned_addr - aligned_start), chunk_size);
784         num_chunks_total += num_chunks;
785 
786         // We've already accounted for these chunks, so "remove" them by
787         // bringing start, addr, and end closer together to calculate the
788         // remaining chunk sizes.
789         temp_size = num_chunks * chunk_size;
790         addr -= temp_size;
791         end -= temp_size;
792 
793         // Once there's no separation between addr and start, and we've
794         // successfully found the right chunk size when taking end into account,
795         // we're done.
796         if (addr == start && final_chunk_size)
797             break;
798     }
799 
800     // Handle PAGE_SIZE cleanup since we skipped it in the loop
801     num_chunks_total += (addr - start) / PAGE_SIZE;
802     if (final_chunk_size == 0)
803         final_chunk_size = PAGE_SIZE;
804 
805 out:
806     if (out_chunk_size)
807         *out_chunk_size = final_chunk_size;
808 
809     return num_chunks_total;
810 }
811 
812 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
813                                           NvU64 start,
814                                           NvU64 size,
815                                           uvm_gpu_t *gpu,
816                                           uvm_page_index_t page_index,
817                                           uvm_chunk_size_t *out_chunk_size)
818 {
819     if (uvm_va_block_is_hmm(va_block)) {
820         if (out_chunk_size)
821             *out_chunk_size = PAGE_SIZE;
822         return page_index;
823     }
824 
825     return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
826 }
827 
828 static size_t block_gpu_chunk_index(uvm_va_block_t *block,
829                                     uvm_gpu_t *gpu,
830                                     uvm_page_index_t page_index,
831                                     uvm_chunk_size_t *out_chunk_size)
832 {
833     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
834     uvm_chunk_size_t size;
835     uvm_gpu_chunk_t *chunk;
836     size_t index;
837 
838     index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);
839 
840     UVM_ASSERT(size >= PAGE_SIZE);
841 
842     if (gpu_state) {
843         UVM_ASSERT(gpu_state->chunks);
844         chunk = gpu_state->chunks[index];
845         if (chunk) {
846             UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
847             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
848             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
849         }
850     }
851 
852     if (out_chunk_size)
853         *out_chunk_size = size;
854 
855     return index;
856 }
857 
858 // Compute the size of the chunk known to start at start_page_index
859 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
860 {
861     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
862     uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
863     NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
864     NvU64 size = block->end - start + 1;
865 
866     if (uvm_va_block_is_hmm(block))
867         return PAGE_SIZE;
868 
869     // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
870     // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
871     // Example: 1011000 -> 0001111
872     start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));
873 
874     // Next, compute all sizes (powers of two) which are <= size.
875     pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
876     pow2_leq_size |= pow2_leq_size - 1;
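    // Example: size == 96K yields rounddown_pow_of_two(size) == 64K, so
    // pow2_leq_size ends up with all size bits up to and including 64K set.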
877 
    // Now AND them all together to get our list of GPU-supported chunk sizes
    // which are aligned to start and will fit within size.
880     allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;
881 
882     // start and size must always be aligned to at least the smallest supported
883     // chunk size (PAGE_SIZE).
    UVM_ASSERT(allowed_sizes & PAGE_SIZE);
885 
886     // Take the largest allowed size
887     return uvm_chunk_find_last_size(allowed_sizes);
888 }
889 
890 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
891 {
892     return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
893 }
894 
895 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
896 {
897     uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
898     return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
899 }
900 
901 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
902 {
903     size_t chunk_index;
904     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
905     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
906 
907     uvm_assert_mutex_locked(&va_block->lock);
908 
909     if (!gpu_state)
910         return NULL;
911 
912     chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
913 
914     return gpu_state->chunks[chunk_index];
915 }
916 
917 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
918                               NvU64 start,
919                               NvU64 end,
920                               uvm_va_block_t **out_block)
921 {
922     uvm_va_block_t *block = NULL;
923     NvU64 size = end - start + 1;
924 
925     UVM_ASSERT(PAGE_ALIGNED(start));
926     UVM_ASSERT(PAGE_ALIGNED(end + 1));
927     UVM_ASSERT(PAGE_ALIGNED(size));
928     UVM_ASSERT(size > 0);
929     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
930 
931     if (va_range) {
932         // Create a managed va_block.
933         UVM_ASSERT(start >= va_range->node.start);
934         UVM_ASSERT(end <= va_range->node.end);
935         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
936     }
937 
938     // Blocks can't span a block alignment boundary
939     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
940 
941     if (uvm_enable_builtin_tests) {
942         uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
943 
944         if (block_wrapper)
945             block = &block_wrapper->block;
946     }
947     else {
948         block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
949     }
950 
951     if (!block)
952         return NV_ERR_NO_MEMORY;
953 
954     nv_kref_init(&block->kref);
955     uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
956     block->start = start;
957     block->end = end;
958     block->va_range = va_range;
959     uvm_tracker_init(&block->tracker);
960     block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;
961 
962     nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);
963 
964     *out_block = block;
965     return NV_OK;
966 }
967 
968 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
969 {
970     NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
971     if (gpu_mapping_addr == 0)
972         return;
973 
974     uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
975     uvm_cpu_chunk_unmap_gpu_phys(chunk, gpu->parent);
976 }
977 
978 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
979                                                   uvm_va_block_t *block,
980                                                   uvm_page_index_t page_index,
981                                                   uvm_gpu_t *gpu)
982 {
983     NV_STATUS status;
984     uvm_chunk_size_t chunk_size;
985 
    // When the Confidential Computing feature is enabled, transfers don't use
    // the DMA mapping of CPU chunks (since it's protected memory), but instead
    // the DMA address of the unprotected DMA buffer.
989     if (uvm_conf_computing_mode_enabled(gpu))
990         return NV_OK;
991 
992     status = uvm_cpu_chunk_map_gpu(chunk, gpu);
993     if (status != NV_OK)
994         return status;
995 
996     chunk_size = uvm_cpu_chunk_get_size(chunk);
997 
998     // TODO: Bug 3744779: Handle benign assertion in
999     //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
1000     //       failure.
1001     status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
1002                                                      uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
1003                                                      uvm_va_block_cpu_page_address(block, page_index),
1004                                                      chunk_size,
1005                                                      block,
1006                                                      UVM_ID_CPU);
1007     if (status != NV_OK)
1008         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1009 
1010     return status;
1011 }
1012 
1013 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1014 {
1015     uvm_cpu_chunk_t *chunk;
1016     uvm_page_index_t page_index;
1017 
1018     for_each_cpu_chunk_in_block(chunk, page_index, block)
1019         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1020 }
1021 
1022 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1023 {
1024     NV_STATUS status;
1025     uvm_cpu_chunk_t *chunk;
1026     NvU64 block_mapping_size = uvm_va_block_size(block);
1027     uvm_page_index_t page_index;
1028 
1029     UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
1030 
1031     for_each_cpu_chunk_in_block(chunk, page_index, block) {
1032         UVM_ASSERT_MSG(uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent) == 0,
1033                        "GPU%u DMA address 0x%llx\n",
1034                        uvm_id_value(gpu->id),
1035                        uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent));
1036 
1037         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
1038         if (status != NV_OK)
1039             goto error;
1040     }
1041 
1042     return NV_OK;
1043 
1044 error:
1045     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
1046     return status;
1047 }
1048 
1049 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
1050                                                      uvm_gpu_t *local_gpu,
1051                                                      uvm_gpu_chunk_t *chunk,
1052                                                      uvm_gpu_t *accessing_gpu)
1053 {
1054     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1055     return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
1056                                                          peer_addr,
1057                                                          block->start + chunk->va_block_page_index * PAGE_SIZE,
1058                                                          uvm_gpu_chunk_get_size(chunk),
1059                                                          block,
1060                                                          local_gpu->id);
1061 }
1062 
1063 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
1064                                                    uvm_gpu_chunk_t *chunk,
1065                                                    uvm_gpu_t *accessing_gpu)
1066 {
1067     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1068     uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
1069 }
1070 
1071 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
1072                                                         uvm_gpu_t *local_gpu,
1073                                                         uvm_gpu_t *accessing_gpu)
1074 {
1075     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1076     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1077     size_t num_chunks, i;
1078     NV_STATUS status;
1079 
1080     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1081                                        accessing_gpu->id));
1082 
1083     // If no chunks are allocated currently, the mappings will be created later
1084     // at chunk allocation.
1085     if (!gpu_state || !gpu_state->chunks)
1086         return NV_OK;
1087 
1088     num_chunks = block_num_gpu_chunks(block, local_gpu);
1089     for (i = 0; i < num_chunks; i++) {
1090         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1091         if (!chunk)
1092             continue;
1093 
1094         status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
1095         if (status != NV_OK)
1096             goto error;
1097 
1098         status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
1099         if (status != NV_OK)
1100             goto error;
1101     }
1102 
1103     return NV_OK;
1104 
1105 error:
1106     while (i-- > 0) {
1107         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1108         if (chunk) {
1109             // Indirect peer mappings are removed lazily by PMM, so if an error
1110             // occurs the mappings established above will be removed when the
1111             // chunk is freed later on. We only need to remove the sysmem
1112             // reverse mappings.
1113             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1114         }
1115     }
1116 
1117     return status;
1118 }
1119 
1120 // Mappings for indirect peers are removed lazily by PMM, but we need to remove
1121 // the entries from the reverse map.
1122 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
1123                                                      uvm_gpu_t *local_gpu,
1124                                                      uvm_gpu_t *accessing_gpu)
1125 {
1126     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1127     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1128     size_t num_chunks, i;
1129 
1130     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1131                                        accessing_gpu->id));
1132 
1133     // Exit if no chunks are allocated currently.
1134     if (!gpu_state || !gpu_state->chunks)
1135         return;
1136 
1137     num_chunks = block_num_gpu_chunks(block, local_gpu);
1138     for (i = 0; i < num_chunks; i++) {
1139         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1140         if (chunk)
1141             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1142     }
1143 }
1144 
// Retrieves the gpu_state for the given GPU, allocating it if it doesn't
// already exist. The returned state is managed internally; the caller must not
// free it.
1148 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
1149 {
1150     NV_STATUS status;
1151     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1152 
1153     if (gpu_state)
1154         return gpu_state;
1155 
1156     gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
1157     if (!gpu_state)
1158         return NULL;
1159 
1160     gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
1161     if (!gpu_state->chunks)
1162         goto error;
1163 
1164     block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;
1165 
1166     status = block_gpu_map_phys_all_cpu_pages(block, gpu);
1167     if (status != NV_OK)
1168         goto error;
1169 
1170     return gpu_state;
1171 
1172 error:
1173     uvm_kvfree(gpu_state->chunks);
1174     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
1175     block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;
1176 
1177     return NULL;
1178 }
1179 
1180 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
1181 {
1182     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1183     uvm_gpu_id_t gpu_id;
1184 
1185     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1186     uvm_assert_mutex_locked(&va_block->lock);
1187 
1188     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
1189         if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
1190             return NV_ERR_NO_MEMORY;
1191     }
1192 
1193     return NV_OK;
1194 }
1195 
1196 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
1197                                           uvm_cpu_chunk_t *chunk,
1198                                           uvm_page_index_t page_index)
1199 {
1200     uvm_gpu_id_t id;
1201 
1202     for_each_gpu_id(id) {
1203         if (uvm_va_block_gpu_state_get(block, id))
1204             cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
1205     }
1206 }
1207 
1208 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
1209                                              uvm_page_index_t page_index)
1210 {
1211     NV_STATUS status;
1212     uvm_gpu_id_t id;
1213     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1214     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1215     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1216 
1217     // We can't iterate over va_space->registered_gpus because we might be
1218     // on the eviction path, which does not have the VA space lock held. We have
1219     // the VA block lock held however, so the gpu_states can't change.
1220     uvm_assert_mutex_locked(&block->lock);
1221 
1222     for_each_gpu_id(id) {
1223         uvm_gpu_t *gpu;
1224 
1225         if (!uvm_va_block_gpu_state_get(block, id))
1226             continue;
1227 
1228         gpu = block_get_gpu(block, id);
1229         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
1230         if (status != NV_OK)
1231             goto error;
1232     }
1233 
1234     return NV_OK;
1235 
1236 error:
1237     uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
1238     return status;
1239 }
1240 
1241 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1242 {
1243     uvm_cpu_chunk_t *chunk;
1244     uvm_page_index_t page_index, next_page_index;
1245     uvm_va_block_region_t chunk_region;
1246 
1247     for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, region) {
1248         chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
1249 
1250         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
1251         uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
1252         uvm_page_mask_region_clear(&va_block->cpu.resident, chunk_region);
1253         uvm_cpu_chunk_remove_from_block(va_block, page_index);
1254         uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1255         uvm_cpu_chunk_free(chunk);
1256     }
1257 
1258     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1259         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1260     if (uvm_page_mask_empty(&va_block->cpu.resident))
1261         uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
1262 }
1263 
1264 // Create physical mappings to allow other GPUs to access this chunk.
1265 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1266 {
1267     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1268     uvm_gpu_t *accessing_gpu, *remove_gpu;
1269     NV_STATUS status;
1270 
1271     // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
1272     // the eviction path, so we can assume that the VA space is locked.
1273     //
1274     // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
1275     //       meaning we may need to allocate peer memory and map it on the
1276     //       eviction path. That will require making sure that peers can't be
1277     //       enabled or disabled either in the VA space or globally within this
1278     //       function.
1279     uvm_assert_rwsem_locked(&va_space->lock);
1280     uvm_assert_mutex_locked(&block->lock);
1281 
1282     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1283         status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
1284         if (status != NV_OK)
1285             goto error;
1286 
1287         status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
1288         if (status != NV_OK)
1289             goto error;
1290     }
1291 
1292     return NV_OK;
1293 
1294 error:
1295     for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1296         if (remove_gpu == accessing_gpu)
1297             break;
1298 
1299         // Indirect peer mappings are removed lazily by PMM, so if an error
1300         // occurs the mappings established above will be removed when the
1301         // chunk is freed later on. We only need to remove the sysmem
1302         // reverse mappings.
1303         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
1304     }
1305 
1306     return status;
1307 }
1308 
1309 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1310 {
1311     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1312     uvm_gpu_t *peer_gpu;
1313 
1314     uvm_assert_rwsem_locked(&va_space->lock);
1315     uvm_assert_mutex_locked(&block->lock);
1316 
1317     // Indirect peer mappings are removed lazily by PMM, so we only need to
1318     // remove the sysmem reverse mappings.
1319     for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
1320         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
1321 }
1322 
1323 // Mark a CPU page as dirty.
static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1325 {
1326     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1327     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1328     uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
1329 }
1330 
1331 // Mark a CPU page as clean.
1332 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index)
1333 {
1334     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1335     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1336     uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
1337 }
1338 
1339 // Check if a CPU page is dirty.
1340 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index)
1341 {
1342     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
1343     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1344     return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
1345 }
1346 
1347 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
1348                                        uvm_chunk_size_t alloc_size,
1349                                        uvm_cpu_chunk_alloc_flags_t flags,
1350                                        uvm_cpu_chunk_t **chunk)
1351 {
1352     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1353 
    // Return an out-of-memory error if the tests have requested it. As opposed
    // to other error injection settings, this one fails N times and then
    // succeeds.
1356     // TODO: Bug 3701182: This will print a warning in Linux kernels newer than
1357     // 5.16.0-rc1+.
1358     if (block_test && block_test->inject_cpu_pages_allocation_error_count) {
1359         if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
1360             block_test->inject_cpu_pages_allocation_error_count--;
1361         return NV_ERR_NO_MEMORY;
1362     }
1363 
1364     return uvm_cpu_chunk_alloc(alloc_size, flags, chunk);
1365 }
1366 
// Allocates the requested CPU pages in the block, if they aren't already
// allocated.
//
// Also maps the pages for physical access by all GPUs used by the block, which
// is required for IOMMU support. This mapping is skipped on GPUs without
// access to CPU memory, e.g., when the Confidential Computing feature is
// enabled.
1372 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
1373                                           uvm_page_mask_t *populate_page_mask,
1374                                           uvm_va_block_region_t populate_region,
1375                                           uvm_va_block_context_t *block_context)
1376 {
1377     NV_STATUS status = NV_OK;
1378     uvm_cpu_chunk_t *chunk;
1379     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1380     uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
1381     uvm_chunk_size_t alloc_size;
1382     uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
1383     uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
1384     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1385     uvm_processor_mask_t uvm_lite_gpus;
1386     uvm_page_index_t page_index;
1387     uvm_gpu_id_t id;
1388 
1389     // Check whether all requested pages have already been allocated.
1390     uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
1391     if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
1392                               &block_context->scratch_page_mask,
1393                               &block->cpu.allocated))
1394         return NV_OK;
1395 
1396     if (block_test) {
1397         if (block_test->cpu_chunk_allocation_size_mask)
1398             cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
1399     }
1400 
1401     uvm_page_mask_zero(resident_mask);
1402     for_each_id_in_mask (id, &block->resident)
1403         uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id));
1404 
1405     // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
1406     // should be used in order to avoid extra copies due to dirty compound
1407     // pages. HMM va_blocks also require PAGE_SIZE allocations.
1408     // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
1409     // migrations.
1410     uvm_processor_mask_andnot(&uvm_lite_gpus, &va_space->registered_gpus, &va_space->faultable_processors);
1411     if (!uvm_processor_mask_empty(&uvm_lite_gpus) || uvm_va_block_is_hmm(block))
1412         cpu_allocation_sizes = PAGE_SIZE;
1413 
1414     if (block_context->mm)
1415         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;
1416 
1417     UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
1418     UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);
1419 
1420     for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
1421         uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags;
1422         uvm_va_block_region_t region = populate_region;
1423 
1424         if (uvm_page_mask_test(&block->cpu.allocated, page_index)) {
1425             page_index = uvm_va_block_next_unset_page_in_mask(populate_region, &block->cpu.allocated, page_index) - 1;
1426             continue;
1427         }
1428 
1429         UVM_ASSERT(!uvm_page_mask_test(&block->cpu.resident, page_index));
1430 
1431         chunk_alloc_flags = alloc_flags;
1432 
        // Attempt to allocate CPU pages with the largest physically contiguous
        // size possible from the set of allowed CPU chunk sizes. This is
        // accomplished by:
        //   1. Aligning the CPU page address down to the allocation size.
        //   2. Ensuring that the entire allocation region fits within the VA
1438         //      block.
1439         //   3. Ensuring that the region covered by the allocation is empty.
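        //
        // Illustrative example (made-up addresses, not taken from the code
        // below): for a 2MB-aligned block starting at 0x7f0000000000,
        // page_index 3 (VA 0x7f0000003000) and a candidate alloc_size of 64K,
        // the address aligns down to 0x7f0000000000 and the candidate region
        // covers pages [0, 16). The 64K size is used only if that whole region
        // lies inside the block and none of its pages are already allocated;
        // otherwise the loop falls back to the next smaller size.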
1440         for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1441             NvU64 alloc_virt_addr;
1442 
1443             chunk = NULL;
1444             alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(block, page_index), alloc_size);
1445 
1446             if (!uvm_va_block_contains_address(block, alloc_virt_addr) ||
1447                 !uvm_va_block_contains_address(block, alloc_virt_addr + alloc_size - 1))
1448                 continue;
1449 
1450             region = uvm_va_block_region_from_start_end(block, alloc_virt_addr, alloc_virt_addr + alloc_size - 1);
1451 
1452             if (!uvm_page_mask_region_empty(&block->cpu.allocated, region))
1453                 continue;
1454 
            // If not all pages in the allocation region are resident
            // somewhere, zero out the allocated chunk. This could be wasteful
            // if only a few pages in a high-order allocation need to be zeroed
            // out, but the alternative is to map single sub-pages one-by-one.
1460             if (!uvm_page_mask_region_full(resident_mask, region))
1461                 chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
1462 
1463             status = block_alloc_cpu_chunk(block, alloc_size, chunk_alloc_flags, &chunk);
1464             if (status == NV_OK) {
1465                 page_index = region.first;
1466                 break;
1467             }
1468 
1469             UVM_ASSERT(status == NV_ERR_NO_MEMORY);
1470         }
1471 
1472         if (status != NV_OK)
1473             break;
1474 
1475         status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
1476         if (status != NV_OK) {
1477             uvm_cpu_chunk_free(chunk);
1478             return status;
1479         }
1480 
1481         status = uvm_va_block_map_cpu_chunk_on_gpus(block, page_index);
1482         if (status != NV_OK)
1483             break;
1484 
1485         // Skip iterating over all pages covered by the allocated chunk.
1486         page_index = region.outer - 1;
1487     }
1488 
1489     if (status != NV_OK && chunk) {
1490         uvm_cpu_chunk_remove_from_block(block, page_index);
1491         uvm_cpu_chunk_free(chunk);
1492     }
1493 
1494     return status;
1495 }
1496 
1497 // Try allocating a chunk. If eviction was required,
1498 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
1499 // unlocked and relocked. The caller is responsible for adding the chunk to the
1500 // retry used_chunks list.
1501 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
1502                                        uvm_va_block_retry_t *retry,
1503                                        uvm_gpu_t *gpu,
1504                                        uvm_chunk_size_t size,
1505                                        uvm_gpu_chunk_t **out_gpu_chunk)
1506 {
1507     NV_STATUS status = NV_OK;
1508     uvm_gpu_chunk_t *gpu_chunk;
1509 
1510     // First try getting a free chunk from previously-made allocations.
1511     gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
1512     if (!gpu_chunk) {
1513         uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1514         if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
1515             // Force eviction by pretending the allocation failed with no memory
1516             --block_test->user_pages_allocation_retry_force_count;
1517             status = NV_ERR_NO_MEMORY;
1518         }
1519         else {
1520             // Try allocating a new one without eviction
1521             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
1522         }
1523 
1524         if (status == NV_ERR_NO_MEMORY) {
1525             // If that fails with no memory, try allocating with eviction and
1526             // return back to the caller immediately so that the operation can
1527             // be restarted.
1528             uvm_mutex_unlock(&block->lock);
1529 
1530             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
1531             if (status == NV_OK) {
1532                 block_retry_add_free_chunk(retry, gpu_chunk);
1533                 status = NV_ERR_MORE_PROCESSING_REQUIRED;
1534             }
1535 
1536             uvm_mutex_lock(&block->lock);
1537             return status;
1538         }
1539         else if (status != NV_OK) {
1540             return status;
1541         }
1542     }
1543 
1544     *out_gpu_chunk = gpu_chunk;
1545     return NV_OK;
1546 }
1547 
1548 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
1549 {
1550     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1551 
1552     if (!gpu_state)
1553         return false;
1554 
1555     return gpu_state->page_table_range_4k.table  ||
1556            gpu_state->page_table_range_big.table ||
1557            gpu_state->page_table_range_2m.table;
1558 }
1559 
1560 // A helper to get a known-to-be-present GPU VA space given a VA block that's
1561 // locked. In order to use this function, the caller must know that at least one
1562 // of these conditions is true:
1563 //
1564 // 1) The VA space lock is held
1565 // 2) The VA block has active page tables for the GPU
1566 //
1567 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go
1568 // away.
1569 //
1570 // On the eviction path, we don't have a lock on the VA space state. However,
1571 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page
1572 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU
1573 // has page tables (#2), the gpu_va_space can't go away while we're holding the
1574 // block lock.
1575 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1576 {
1577     uvm_gpu_va_space_t *gpu_va_space;
1578     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1579 
1580     UVM_ASSERT(gpu);
1581 
1582     if (!block_gpu_has_page_tables(va_block, gpu))
1583         uvm_assert_rwsem_locked(&va_space->lock);
1584 
1585     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));
1586 
1587     gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
1588 
1589     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1590     UVM_ASSERT(gpu_va_space->va_space == va_space);
1591     UVM_ASSERT(gpu_va_space->gpu == gpu);
1592 
1593     return gpu_va_space;
1594 }
1595 
1596 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
1597 {
1598     uvm_gpu_va_space_t *gpu_va_space;
1599 
1600     // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
1601     if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
1602         return false;
1603 
1604     UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);
1605 
1606     gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
1607     return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
1608 }
1609 
1610 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
1611 {
1612     uvm_gpu_va_space_t *gpu_va_space;
1613 
1614     gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
1615     return gpu_va_space->page_tables.big_page_size;
1616 }
1617 
1618 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
1619 {
1620     NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
1621     NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
1622 
1623     // The range must fit within a VA block
1624     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
1625 
1626     if (outer_addr <= first_addr)
1627         return uvm_va_block_region(0, 0);
1628 
1629     return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
1630 }
1631 
1632 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
1633 {
1634     uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
1635     return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
1636 }
1637 
1638 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
1639 {
1640     return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
1641 }
1642 
1643 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
1644                                                           uvm_va_block_region_t region,
1645                                                           NvU32 big_page_size)
1646 {
1647     NvU64 start = uvm_va_block_region_start(va_block, region);
1648     NvU64 end = uvm_va_block_region_end(va_block, region);
1649     uvm_va_block_region_t big_region;
1650 
1651     UVM_ASSERT(start < va_block->end);
1652     UVM_ASSERT(end <= va_block->end);
1653 
1654     big_region = range_big_page_region_all(start, end, big_page_size);
1655     if (big_region.outer) {
1656         big_region.first += region.first;
1657         big_region.outer += region.first;
1658     }
1659 
1660     return big_region;
1661 }
1662 
1663 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
1664 {
1665     return range_num_big_pages(va_block->start, va_block->end, big_page_size);
1666 }
1667 
1668 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1669 {
1670     NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
1671     UVM_ASSERT(addr >= va_block->start);
1672     UVM_ASSERT(addr < va_block->end);
1673     return addr;
1674 }
1675 
1676 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
1677 {
1678     NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
1679 
1680     // Assume that we don't have to handle multiple big PTEs per system page.
1681     // It's not terribly difficult to implement, but we don't currently have a
1682     // use case.
1683     UVM_ASSERT(big_page_size >= PAGE_SIZE);
1684 
1685     return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
1686 }
1687 
1688 // Returns the big page index (the bit index within
1689 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
1690 // page_index cannot be covered by a big PTE due to alignment or block size,
1691 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
1692 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
1693 {
1694     uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
1695     size_t big_index;
1696 
1697     // Note that this condition also handles the case of having no big pages in
1698     // the block, in which case .first >= .outer.
1699     if (page_index < big_region_all.first || page_index >= big_region_all.outer)
1700         return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;
1701 
1702     big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);
1703 
1704     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
1705     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);
1706 
1707     return big_index;
1708 }
1709 
1710 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
1711                                              uvm_gpu_t *gpu,
1712                                              uvm_page_mask_t *mask_out,
1713                                              const unsigned long *big_ptes_in)
1714 {
1715     uvm_va_block_region_t big_region;
1716     size_t big_page_index;
1717     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
1718 
1719     uvm_page_mask_zero(mask_out);
1720 
1721     for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
1722         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
1723         uvm_page_mask_region_fill(mask_out, big_region);
1724     }
1725 }
1726 
1727 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
1728 {
1729     if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
1730         return 0;
1731 
1732     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU));
1733 
1734     // Despite the fact that physical CPU memory can be allocated at sizes
1735     // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory
1736     // with 4K PTEs. Until the core kernel adds support for PMD mappings,
1737     // the return value of this function will remain at PAGE_SIZE.
1738     return PAGE_SIZE;
1739 }
1740 
1741 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
1742 {
1743     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1744     size_t big_page_size, big_page_index;
1745 
1746     if (!gpu_state)
1747         return 0;
1748 
1749     if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
1750         return 0;
1751 
1752     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id));
1753 
1754     if (gpu_state->pte_is_2m)
1755         return UVM_PAGE_SIZE_2M;
1756 
1757     big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id));
1758     big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size);
1759     if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes))
1760         return big_page_size;
1761 
1762     return UVM_PAGE_SIZE_4K;
1763 }
1764 
1765 // Get the size of the physical allocation backing the page, or 0 if not
1766 // resident. Note that this is different from uvm_va_block_page_size_* because
1767 // those return the size of the PTE which maps the page index, which may be
1768 // smaller than the physical allocation.
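// For example, a page backed by a 2MB GPU chunk but mapped by a 64K big PTE
// reports 64K from uvm_va_block_page_size_gpu() but 2MB from this function.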
1769 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
1770 {
1771     uvm_va_block_gpu_state_t *gpu_state;
1772     uvm_chunk_size_t chunk_size;
1773 
1774     if (UVM_ID_IS_CPU(page.processor)) {
1775         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.page_index);
1776 
1777         if (!uvm_page_mask_test(&block->cpu.resident, page.page_index))
1778             return 0;
1779 
1780         UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
1781         return (NvU32)uvm_cpu_chunk_get_size(chunk);
1782     }
1783 
1784     gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
1785     if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index))
1786         return 0;
1787 
1788     UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
1789     block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
1790     return (NvU32)chunk_size;
1791 }
1792 
1793 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
1794 {
1795     uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
1796 
1797     // ATOMIC and WRITE are synonyms for the CPU
1798     if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE)
1799         pte_bit_index = UVM_PTE_BITS_CPU_WRITE;
1800     else if (prot == UVM_PROT_READ_ONLY)
1801         pte_bit_index = UVM_PTE_BITS_CPU_READ;
1802     else
1803         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1804 
1805     return pte_bit_index;
1806 }
1807 
1808 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
1809 {
1810     uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX;
1811 
1812     if (prot == UVM_PROT_READ_WRITE_ATOMIC)
1813         pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC;
1814     else if (prot == UVM_PROT_READ_WRITE)
1815         pte_bit_index = UVM_PTE_BITS_GPU_WRITE;
1816     else if (prot == UVM_PROT_READ_ONLY)
1817         pte_bit_index = UVM_PTE_BITS_GPU_READ;
1818     else
1819         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
1820 
1821     return pte_bit_index;
1822 }
1823 
1824 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1825 {
1826     uvm_va_block_gpu_state_t *gpu_state;
1827 
1828     if (UVM_ID_IS_CPU(processor))
1829         return &block->cpu.resident;
1830 
1831     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1832 
1833     UVM_ASSERT(gpu_state);
1834     return &gpu_state->resident;
1835 }
1836 
1837 // Get the page residency mask for a processor
1838 //
// Notably this will allocate the GPU state if it is not yet present, and
// returns NULL if that allocation fails.
1841 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor)
1842 {
1843     uvm_va_block_gpu_state_t *gpu_state;
1844 
1845     if (UVM_ID_IS_CPU(processor))
1846         return &block->cpu.resident;
1847 
1848     gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
1849     if (!gpu_state)
1850         return NULL;
1851 
1852     return &gpu_state->resident;
1853 }
1854 
1855 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block,
1856                                                            uvm_processor_id_t processor,
1857                                                            uvm_prot_t prot)
1858 {
1859     uvm_va_block_gpu_state_t *gpu_state;
1860 
1861     if (UVM_ID_IS_CPU(processor))
1862         return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)];
1863 
1864     gpu_state = uvm_va_block_gpu_state_get(block, processor);
1865 
1866     UVM_ASSERT(gpu_state);
1867     return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)];
1868 }
1869 
1870 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
1871 {
1872     return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
1873 }
1874 
1875 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
1876 {
1877     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
1878     UVM_ASSERT(gpu_state);
1879 
1880     return &gpu_state->evicted;
1881 }
1882 
1883 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index)
1884 {
1885     uvm_processor_id_t id;
1886     for_each_id_in_mask(id, &block->resident) {
1887         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id), page_index))
1888             return true;
1889     }
1890 
1891     return false;
1892 }
1893 
1894 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1895 {
1896     uvm_va_block_gpu_state_t *gpu_state;
1897     size_t chunk_index;
1898 
1899     if (UVM_ID_IS_CPU(proc))
1900         return uvm_page_mask_test(&block->cpu.allocated, page_index);
1901 
1902     gpu_state = uvm_va_block_gpu_state_get(block, proc);
1903     if (!gpu_state)
1904         return false;
1905 
1906     chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL);
1907     return gpu_state->chunks[chunk_index] != NULL;
1908 }
1909 
1910 static bool block_processor_page_is_resident_on(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
1911 {
1912     const uvm_page_mask_t *resident_mask;
1913 
1914     if (UVM_ID_IS_CPU(proc)) {
1915         resident_mask = &block->cpu.resident;
1916     }
1917     else {
1918         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, proc);
1919         if (!gpu_state)
1920             return false;
1921 
1922         resident_mask = &gpu_state->resident;
1923     }
1924 
1925     return uvm_page_mask_test(resident_mask, page_index);
1926 }
1927 
// Compute the GPUs that have at least the given access permissions for the
// given region. A GPU's bit is set if any page in the region has those
// permissions.
1931 static void block_region_authorized_gpus(uvm_va_block_t *va_block,
1932                                          uvm_va_block_region_t region,
1933                                          uvm_prot_t access_permission,
1934                                          uvm_processor_mask_t *authorized_gpus)
1935 {
1936     uvm_gpu_id_t gpu_id;
1937     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission);
1938 
1939     uvm_processor_mask_zero(authorized_gpus);
1940 
1941     // Test all GPUs with mappings on the block
1942     for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) {
1943         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1944         if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region))
1945             uvm_processor_mask_set(authorized_gpus, gpu_id);
1946     }
1947 }
1948 
// Compute the processors that have at least the given access permissions for
// the given region. A processor's bit is set if any page in the region has
// those permissions.
1952 static void block_region_authorized_processors(uvm_va_block_t *va_block,
1953                                                uvm_va_block_region_t region,
1954                                                uvm_prot_t access_permission,
1955                                                uvm_processor_mask_t *authorized_processors)
1956 {
1957     uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission);
1958 
1959     // Compute GPUs
1960     block_region_authorized_gpus(va_block, region, access_permission, authorized_processors);
1961 
1962     // Test CPU
1963     if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) &&
1964         !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) {
1965         uvm_processor_mask_set(authorized_processors, UVM_ID_CPU);
1966     }
1967 }
1968 
1969 static void block_page_authorized_processors(uvm_va_block_t *va_block,
1970                                              uvm_page_index_t page_index,
1971                                              uvm_prot_t access_permission,
1972                                              uvm_processor_mask_t *authorized_processors)
1973 {
1974     block_region_authorized_processors(va_block,
1975                                        uvm_va_block_region_for_page(page_index),
1976                                        access_permission,
1977                                        authorized_processors);
1978 }
1979 
1980 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
1981                                                     uvm_va_block_region_t region,
1982                                                     uvm_gpu_id_t gpu_id,
1983                                                     uvm_prot_t required_prot)
1984 {
1985     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot);
1986     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
1987 
1988     if (!gpu_state)
1989         return false;
1990 
1991     return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region);
1992 }
1993 
1994 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
1995                                                           uvm_va_block_region_t region,
1996                                                           uvm_processor_id_t processor_id,
1997                                                           uvm_prot_t required_prot)
1998 {
1999     if (UVM_ID_IS_CPU(processor_id)) {
2000         uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot);
2001 
2002         return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region);
2003     }
2004     else {
2005         return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot);
2006     }
2007 }
2008 
2009 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2010                                          uvm_page_index_t page_index,
2011                                          uvm_gpu_id_t gpu_id,
2012                                          uvm_prot_t required_prot)
2013 {
2014     return block_is_gpu_authorized_on_whole_region(va_block,
2015                                                    uvm_va_block_region_for_page(page_index),
2016                                                    gpu_id,
2017                                                    required_prot);
2018 }
2019 
2020 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block,
2021                                                uvm_page_index_t page_index,
2022                                                uvm_processor_id_t processor_id,
2023                                                uvm_prot_t required_prot)
2024 {
2025     return block_is_processor_authorized_on_whole_region(va_block,
2026                                                          uvm_va_block_region_for_page(page_index),
2027                                                          processor_id,
2028                                                          required_prot);
2029 }
2030 
// Compute the GPUs that have a copy of the given page resident in their memory
2032 static void block_page_resident_gpus(uvm_va_block_t *va_block,
2033                                      uvm_page_index_t page_index,
2034                                      uvm_processor_mask_t *resident_gpus)
2035 {
2036     uvm_gpu_id_t id;
2037     uvm_processor_mask_zero(resident_gpus);
2038 
2039     for_each_gpu_id_in_mask(id, &va_block->resident) {
2040         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index)) {
2041             UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
2042             uvm_processor_mask_set(resident_gpus, id);
2043         }
2044     }
2045 }
2046 
2047 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2048                                            uvm_page_index_t page_index,
2049                                            uvm_processor_mask_t *resident_processors)
2050 {
2051     block_page_resident_gpus(va_block, page_index, resident_processors);
2052 
2053     if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU), page_index)) {
2054         UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
2055         uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
2056     }
2057 }
2058 
2059 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block, uvm_page_index_t page_index)
2060 {
2061     uvm_processor_mask_t resident_processors;
2062     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
2063 
2064     return uvm_processor_mask_get_count(&resident_processors);
2065 }
2066 
2067 static uvm_processor_id_t block_page_get_closest_resident_in_mask(uvm_va_block_t *va_block,
2068                                                                   uvm_page_index_t page_index,
2069                                                                   uvm_processor_id_t processor,
2070                                                                   const uvm_processor_mask_t *processor_mask)
2071 {
2072     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2073     uvm_processor_mask_t search_mask;
2074     uvm_processor_id_t id;
2075 
2076     if (processor_mask)
2077         uvm_processor_mask_and(&search_mask, processor_mask, &va_block->resident);
2078     else
2079         uvm_processor_mask_copy(&search_mask, &va_block->resident);
2080 
2081     for_each_closest_id(id, &search_mask, processor, va_space) {
2082         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id), page_index))
2083             return id;
2084     }
2085 
2086     return UVM_ID_INVALID;
2087 }
2088 
2089 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2090                                                           uvm_page_index_t page_index,
2091                                                           uvm_processor_id_t processor)
2092 {
2093     return block_page_get_closest_resident_in_mask(va_block, page_index, processor, NULL);
2094 }
2095 
2096 // We don't track the specific aperture of each mapped page. Instead, we assume
2097 // that each virtual mapping from a given processor always targets the closest
2098 // processor on which that page is resident (with special rules for UVM-Lite).
2099 //
2100 // This function verifies that assumption: before a page becomes resident on a
2101 // new location, assert that no processor has a valid mapping to a farther
2102 // processor on that page.
2103 static bool block_check_resident_proximity(uvm_va_block_t *block, uvm_page_index_t page_index, uvm_processor_id_t new_residency)
2104 {
2105     uvm_processor_mask_t resident_procs, mapped_procs;
2106     uvm_processor_id_t mapped_id, closest_id;
2107     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2108 
2109     uvm_processor_mask_andnot(&mapped_procs, &block->mapped, block_get_uvm_lite_gpus(block));
2110 
2111     for_each_id_in_mask(mapped_id, &mapped_procs) {
2112         if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index))
2113             continue;
2114 
2115         uvm_va_block_page_resident_processors(block, page_index, &resident_procs);
2116         UVM_ASSERT(!uvm_processor_mask_empty(&resident_procs));
2117         UVM_ASSERT(!uvm_processor_mask_test(&resident_procs, new_residency));
2118         uvm_processor_mask_set(&resident_procs, new_residency);
2119         closest_id = uvm_processor_mask_find_closest_id(va_space, &resident_procs, mapped_id);
2120         UVM_ASSERT(!uvm_id_equal(closest_id, new_residency));
2121     }
2122 
2123     return true;
2124 }
2125 
2126 // Returns the processor to which page_index should be mapped on gpu
2127 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block,
2128                                                          uvm_gpu_t *gpu,
2129                                                          uvm_page_index_t page_index)
2130 {
2131     uvm_processor_id_t dest_id;
2132 
2133     // UVM-Lite GPUs can only map pages on the preferred location
2134     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id))
2135         return uvm_va_range_get_policy(block->va_range)->preferred_location;
2136 
2137     // Otherwise we always map the closest resident processor
2138     dest_id = uvm_va_block_page_get_closest_resident(block, page_index, gpu->id);
2139     UVM_ASSERT(UVM_ID_IS_VALID(dest_id));
2140     return dest_id;
2141 }
2142 
2143 // Returns the processor to which page_index should be mapped on mapping_id
2144 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block,
2145                                                      uvm_processor_id_t mapping_id,
2146                                                      uvm_page_index_t page_index)
2147 {
2148 
2149     if (UVM_ID_IS_CPU(mapping_id))
2150         return uvm_va_block_page_get_closest_resident(block, page_index, mapping_id);
2151 
2152     return block_gpu_get_processor_to_map(block, block_get_gpu(block, mapping_id), page_index);
2153 }
2154 
2155 static void block_get_mapped_processors(uvm_va_block_t *block,
2156                                         uvm_processor_id_t resident_id,
2157                                         uvm_page_index_t page_index,
2158                                         uvm_processor_mask_t *mapped_procs)
2159 {
2160     uvm_processor_id_t mapped_id;
2161 
2162     uvm_processor_mask_zero(mapped_procs);
2163 
2164     for_each_id_in_mask(mapped_id, &block->mapped) {
2165         if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) {
2166             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2167 
2168             if (uvm_id_equal(to_map_id, resident_id))
2169                 uvm_processor_mask_set(mapped_procs, mapped_id);
2170         }
2171     }
2172 }
2173 
2174 // We use block_gpu_get_processor_to_map to find the destination processor of a
2175 // given GPU mapping. This function is called when the mapping is established to
2176 // sanity check that the destination of the mapping matches the query.
2177 static bool block_check_mapping_residency_region(uvm_va_block_t *block,
2178                                                  uvm_gpu_t *gpu,
2179                                                  uvm_processor_id_t mapping_dest,
2180                                                  uvm_va_block_region_t region,
2181                                                  const uvm_page_mask_t *page_mask)
2182 {
2183     uvm_page_index_t page_index;
2184     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2185         NvU64 va = uvm_va_block_cpu_page_address(block, page_index);
2186         uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, gpu, page_index);
2187         UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map),
2188                        "VA 0x%llx on %s: mapping %s, supposed to map %s",
2189                        va,
2190                        uvm_gpu_name(gpu),
2191                        block_processor_name(block, mapping_dest),
2192                        block_processor_name(block, proc_to_map));
2193     }
2194     return true;
2195 }
2196 
2197 static bool block_check_mapping_residency(uvm_va_block_t *block,
2198                                           uvm_gpu_t *gpu,
2199                                           uvm_processor_id_t mapping_dest,
2200                                           const uvm_page_mask_t *page_mask)
2201 {
2202     return block_check_mapping_residency_region(block,
2203                                                 gpu,
2204                                                 mapping_dest,
2205                                                 uvm_va_block_region_from_block(block),
2206                                                 page_mask);
2207 }
2208 
2209 // Check that there are no mappings targeting resident_id from any processor in
2210 // the block.
2211 static bool block_check_processor_not_mapped(uvm_va_block_t *block, uvm_processor_id_t resident_id)
2212 {
2213     uvm_processor_id_t mapped_id;
2214     uvm_page_index_t page_index;
2215 
2216     for_each_id_in_mask(mapped_id, &block->mapped) {
2217         const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id);
2218 
2219         for_each_va_block_page_in_mask(page_index, map_mask, block) {
2220             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, mapped_id, page_index);
2221             UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id));
2222         }
2223     }
2224 
2225     return true;
2226 }
2227 
// Zero all pages of the newly-populated chunk which are not resident anywhere
// else in the system, adding that work to the block's tracker. In all cases,
// this function adds a dependency on the passed-in tracker to the block's
// tracker.
2231 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
2232                                           uvm_gpu_t *gpu,
2233                                           uvm_gpu_chunk_t *chunk,
2234                                           uvm_va_block_region_t chunk_region,
2235                                           uvm_tracker_t *tracker)
2236 {
2237     uvm_va_block_gpu_state_t *gpu_state;
2238     NV_STATUS status;
2239     uvm_gpu_address_t memset_addr_base, memset_addr;
2240     uvm_push_t push;
2241     uvm_gpu_id_t id;
2242     uvm_va_block_region_t subregion;
2243     uvm_page_mask_t *zero_mask;
2244 
2245     UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk));
2246 
2247     if (chunk->is_zero)
2248         return NV_OK;
2249 
2250     gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2251     zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
2252 
2253     if (!zero_mask)
2254         return NV_ERR_NO_MEMORY;
2255 
2256     // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the
2257     // operation.
2258     //
2259     // We may over-zero the page with this approach. For example, we might be
2260     // populating a 2MB chunk because only a single page within that chunk needs
2261     // to be made resident. If we also zero non-resident pages outside of the
2262     // strict region, we could waste the effort if those pages are populated on
2263     // another processor later and migrated here.
2264     //
2265     // We zero all non-resident pages in the chunk anyway for two reasons:
2266     //
2267     // 1) Efficiency. It's better to do all zeros as pipelined transfers once
2268     //    rather than scatter them around for each populate operation.
2269     //
2270     // 2) Optimizing the common case of block_populate_gpu_chunk being called
2271     //    for already-populated chunks. If we zero once at initial populate, we
2272     //    can simply check whether the chunk is present in the array. Otherwise
2273     //    we'd have to recompute the "is any page resident" mask every time.
2274 
2275     // Roll up all pages in chunk_region which are resident somewhere
2276     uvm_page_mask_zero(zero_mask);
2277     for_each_id_in_mask(id, &block->resident)
2278         uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id));
2279 
2280     // If all pages in the chunk are resident somewhere, we don't need to clear
2281     // anything. Just make sure the chunk is tracked properly.
2282     if (uvm_page_mask_region_full(zero_mask, chunk_region)) {
2283         status = uvm_tracker_add_tracker_safe(&block->tracker, tracker);
2284         goto out;
2285     }
2286 
2287     // Complement to get the pages which are not resident anywhere. These
2288     // are the pages which must be zeroed.
2289     uvm_page_mask_complement(zero_mask, zero_mask);
2290 
2291     memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
2292     memset_addr = memset_addr_base;
2293 
2294     status = uvm_push_begin_acquire(gpu->channel_manager,
2295                                     UVM_CHANNEL_TYPE_GPU_INTERNAL,
2296                                     tracker,
2297                                     &push,
2298                                     "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)",
2299                                     chunk->address,
2300                                     chunk->address + uvm_gpu_chunk_get_size(chunk),
2301                                     uvm_va_block_region_start(block, chunk_region),
2302                                     uvm_va_block_region_end(block, chunk_region) + 1,
2303                                     block->start,
2304                                     block->end + 1);
2305     if (status != NV_OK)
2306         goto out;
2307 
2308     for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) {
2309         // Pipeline the memsets since they never overlap with each other
2310         uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2311 
2312         // We'll push one membar later for all memsets in this loop
2313         uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2314 
2315         memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE;
2316         gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion));
2317     }
2318 
2319     // A membar from this GPU is required between this memset and any PTE write
2320     // pointing this or another GPU to this chunk. Otherwise an engine could
2321     // read the PTE then access the page before the memset write is visible to
2322     // that engine.
2323     //
2324     // This memset writes GPU memory, so local mappings need only a GPU-local
2325     // membar. We can't easily determine here whether a peer GPU will ever map
2326     // this page in the future, so always use a sysmembar. uvm_push_end provides
2327     // one by default.
2328     //
2329     // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map
2330     //       this page. When peer access gets enabled, do a MEMBAR_SYS at that
2331     //       point.
2332     uvm_push_end(&push);
2333     status = uvm_tracker_add_push_safe(&block->tracker, &push);
2334 
2335 out:
2336     if (zero_mask)
2337         kmem_cache_free(g_uvm_page_mask_cache, zero_mask);
2338 
2339     return status;
2340 }
2341 
2342 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
2343                                           uvm_va_block_retry_t *retry,
2344                                           uvm_gpu_t *gpu,
2345                                           size_t chunk_index,
2346                                           uvm_va_block_region_t chunk_region)
2347 {
2348     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
2349     uvm_gpu_chunk_t *chunk = NULL;
2350     uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
2351     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2352     NV_STATUS status;
2353 
2354     if (!gpu_state)
2355         return NV_ERR_NO_MEMORY;
2356 
2357     uvm_assert_mutex_locked(&block->lock);
2358     UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu));
2359     UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes);
2360 
2361     // We zero chunks as necessary at initial population, so if the chunk is
2362     // already populated we're done. See the comment in
2363     // block_zero_new_gpu_chunk.
2364     if (gpu_state->chunks[chunk_index])
2365         return NV_OK;
2366 
2367     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region));
2368 
2369     status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk);
2370     if (status != NV_OK)
2371         return status;
2372 
2373     // In some configurations such as SR-IOV heavy, the chunk cannot be
2374     // referenced using its physical address. Create a virtual mapping.
2375     status = uvm_mmu_chunk_map(chunk);
2376     if (status != NV_OK)
2377         goto chunk_free;
2378 
2379     status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
2380     if (status != NV_OK)
2381         goto chunk_unmap;
2382 
2383     // It is safe to modify the page index field without holding any PMM locks
2384     // because the chunk is pinned, which means that none of the other fields in
2385     // the bitmap can change.
2386     chunk->va_block_page_index = chunk_region.first;
2387 
2388     // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at
2389     // compile-time that it can store VA Block page indexes.
2390     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
2391 
2392     status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
2393     if (status != NV_OK)
2394         goto chunk_unmap;
2395 
2396     if (block_test && block_test->inject_populate_error) {
2397         block_test->inject_populate_error = false;
2398 
2399         // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
2400         // causing a fatal OOM failure.
2401         status = NV_ERR_MORE_PROCESSING_REQUIRED;
2402         goto chunk_unmap_indirect_peers;
2403     }
2404 
2405     // Record the used chunk so that it can be unpinned at the end of the whole
2406     // operation.
2407     block_retry_add_used_chunk(retry, chunk);
2408     gpu_state->chunks[chunk_index] = chunk;
2409 
2410     return NV_OK;
2411 
2412 chunk_unmap_indirect_peers:
2413     block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
2414 
2415 chunk_unmap:
2416     uvm_mmu_chunk_unmap(chunk, &block->tracker);
2417 
2418 chunk_free:
2419     // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it
2420     // placed in the block tracker.
2421     uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
2422 
2423     return status;
2424 }
2425 
2426 // Populate all chunks which cover the given region and page mask.
2427 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
2428                                           uvm_va_block_retry_t *retry,
2429                                           uvm_gpu_t *gpu,
2430                                           uvm_va_block_region_t region,
2431                                           const uvm_page_mask_t *populate_mask)
2432 {
2433     uvm_va_block_region_t chunk_region, check_region;
2434     size_t chunk_index;
2435     uvm_page_index_t page_index;
2436     uvm_chunk_size_t chunk_size;
2437     NV_STATUS status;
2438 
2439     page_index = uvm_va_block_first_page_in_mask(region, populate_mask);
2440     if (page_index == region.outer)
2441         return NV_OK;
2442 
2443     chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
2444     chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
2445 
2446     while (1) {
2447         check_region = uvm_va_block_region(max(chunk_region.first, region.first),
2448                                            min(chunk_region.outer, region.outer));
2449         page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask);
2450         if (page_index != check_region.outer) {
2451             status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region);
2452             if (status != NV_OK)
2453                 return status;
2454         }
2455 
2456         if (check_region.outer == region.outer)
2457             break;
2458 
2459         ++chunk_index;
2460         chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer);
2461         chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE));
2462     }
2463 
2464     return NV_OK;
2465 }
2466 
2467 static NV_STATUS block_populate_pages(uvm_va_block_t *block,
2468                                       uvm_va_block_retry_t *retry,
2469                                       uvm_va_block_context_t *block_context,
2470                                       uvm_processor_id_t dest_id,
2471                                       uvm_va_block_region_t region,
2472                                       const uvm_page_mask_t *page_mask)
2473 {
2474     NV_STATUS status;
2475     const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block, dest_id);
2476     uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
2477     uvm_memcg_context_t memcg_context;
2478 
2479     if (!resident_mask)
2480         return NV_ERR_NO_MEMORY;
2481 
2482     if (page_mask)
2483         uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask);
2484     else
2485         uvm_page_mask_complement(populate_page_mask, resident_mask);
2486 
2487     if (UVM_ID_IS_GPU(dest_id))
2488         return block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
2489 
2490     uvm_memcg_context_start(&memcg_context, block_context->mm);
2491     status = block_populate_pages_cpu(block, populate_page_mask, region, block_context);
2492     uvm_memcg_context_end(&memcg_context);
2493     return status;
2494 }
2495 
2496 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
2497 {
2498     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2499 
2500     return &va_space->can_copy_from[uvm_id_value(from)];
2501 }
2502 
2503 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
2504 {
2505     return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
2506 }
2507 
2508 // Get the chunk containing the given page, along with the offset of that page
2509 // within the chunk.
2510 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
2511 {
2512     uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
2513     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
2514     size_t chunk_index;
2515     uvm_gpu_chunk_t *chunk;
2516     uvm_chunk_size_t chunk_size;
2517 
2518     UVM_ASSERT(gpu_state);
2519 
2520     chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
2521     chunk = gpu_state->chunks[chunk_index];
2522     UVM_ASSERT(chunk);
2523 
2524     if (chunk_offset) {
2525         size_t page_offset = block_page.page_index -
                             uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
2527         *chunk_offset = page_offset * PAGE_SIZE;
2528     }
2529 
2530     return chunk;
2531 }
2532 
// Get the physical GPU address of a block's page from the POV of the specified
// GPU. This is the address that should be used for making PTEs for that GPU.
2535 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
2536                                                       block_phys_page_t block_page,
2537                                                       uvm_gpu_t *gpu)
2538 {
2539     uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2540     size_t chunk_offset;
2541     uvm_gpu_chunk_t *chunk;
2542 
2543     UVM_ASSERT(accessing_gpu_state);
2544 
2545     if (UVM_ID_IS_CPU(block_page.processor)) {
2546         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.page_index);
2547         NvU64 dma_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
2548         uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
2549                                                                        uvm_cpu_chunk_get_size(chunk),
2550                                                                        block_page.page_index);
2551 
2552         // The page should be mapped for physical access already as we do that
2553         // eagerly on CPU page population and GPU state alloc.
2554         UVM_ASSERT(dma_addr != 0);
2555         dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
2556 
2557         return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
2558     }
2559 
2560     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2561 
2562     if (uvm_id_equal(block_page.processor, gpu->id)) {
2563         return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
2564     }
2565     else {
2566         uvm_gpu_phys_address_t phys_addr;
2567         uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
2568         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2569 
2570         UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2571         phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
2572         phys_addr.address += chunk_offset;
2573         return phys_addr;
2574     }
2575 }
2576 
2577 // Get the physical GPU address of a block's page from the POV of the specified
2578 // GPU, suitable for accessing the memory from UVM-internal CE channels.
2579 //
2580 // Notably this may differ from block_phys_page_address() in order to handle CE
2581 // limitations in addressing physical memory directly.
2582 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block,
2583                                                       block_phys_page_t block_page,
2584                                                       uvm_gpu_t *gpu)
2585 {
2586     uvm_gpu_t *owning_gpu;
2587     size_t chunk_offset;
2588     uvm_gpu_chunk_t *chunk;
2589     uvm_gpu_address_t copy_addr;
2590     uvm_va_space_t *va_space;
2591 
2592     UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor),
2593                    "from %s to %s\n",
2594                    block_processor_name(block, gpu->id),
2595                    block_processor_name(block, block_page.processor));
2596 
2597     // CPU and local GPU accesses can rely on block_phys_page_address, but the
2598     // resulting physical address may need to be converted into virtual.
2599     if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id))
2600         return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu));
2601 
2602     va_space = uvm_va_block_get_va_space(block);
2603 
2604     // See the comments on the peer_identity_mappings_supported assignments in
2605     // the HAL for why we disable direct copies between peers.
2606     owning_gpu = block_get_gpu(block, block_page.processor);
2607 
2608     UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
2609 
2610     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
2611     copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu);
2612     copy_addr.address += chunk_offset;
2613     return copy_addr;
2614 }
2615 
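     // Return the physical address of the given page on the residency
     // processor, from the POV of the specified GPU. The block lock must be
     // held.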
2616 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
2617                                                           uvm_page_index_t page_index,
2618                                                           uvm_processor_id_t residency,
2619                                                           uvm_gpu_t *gpu)
2620 {
2621     uvm_assert_mutex_locked(&va_block->lock);
2622 
2623     return block_phys_page_address(va_block, block_phys_page(residency, page_index), gpu);
2624 }
2625 
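     // Return the physical address of the given page in the GPU's own memory,
     // from that GPU's POV.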
2626 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
2627                                                           uvm_page_index_t page_index,
2628                                                           uvm_gpu_t *gpu)
2629 {
2630     return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu);
2631 }
2632 
2633 typedef struct
2634 {
2635     // Location of the memory
2636     uvm_processor_id_t id;
2637 
2638     // Whether the whole block has a single physically-contiguous chunk of
2639     // storage on the processor.
2640     bool is_block_contig;
2641 
2642     // Starting address of the physically-contiguous allocation, from the view
2643     // of the copying GPU. Valid only if is_block_contig.
2644     uvm_gpu_address_t gpu_address;
2645 } block_copy_addr_t;
2646 
2647 typedef struct
2648 {
2649     block_copy_addr_t src;
2650     block_copy_addr_t dst;
2651     uvm_conf_computing_dma_buffer_t *dma_buffer;
2652 } block_copy_state_t;
2653 
2654 // Begin a push appropriate for copying data from the src_id processor to the
2655 // dst_id processor. At least one of src_id and dst_id must be a GPU.
2656 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block,
2657                                        block_copy_state_t *copy_state,
2658                                        uvm_tracker_t *tracker,
2659                                        uvm_push_t *push)
2660 {
2661     uvm_gpu_t *gpu;
2662     NV_STATUS status;
2663     uvm_channel_type_t channel_type;
2664     uvm_tracker_t *tracker_ptr = tracker;
2665     uvm_processor_id_t dst_id = copy_state->dst.id;
2666     uvm_processor_id_t src_id = copy_state->src.id;
2667     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
2668 
2669     UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
2670                    "Unexpected copy to self, processor %s\n",
2671                    block_processor_name(va_block, src_id));
2672 
2673     if (UVM_ID_IS_CPU(src_id)) {
2674         gpu = block_get_gpu(va_block, dst_id);
2675         channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
2676     }
2677     else if (UVM_ID_IS_CPU(dst_id)) {
2678         gpu = block_get_gpu(va_block, src_id);
2679         channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
2680     }
2681     else {
2682         // For GPU to GPU copies, prefer to "push" the data from the source as
2683         // that works better at least for P2P over PCI-E.
2684         gpu = block_get_gpu(va_block, src_id);
2685 
2686         channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
2687     }
2688 
2689     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id),
2690                    "GPU %s dst %s src %s\n",
2691                    block_processor_name(va_block, gpu->id),
2692                    block_processor_name(va_block, dst_id),
2693                    block_processor_name(va_block, src_id));
2694     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id),
2695                    "GPU %s dst %s src %s\n",
2696                    block_processor_name(va_block, gpu->id),
2697                    block_processor_name(va_block, dst_id),
2698                    block_processor_name(va_block, src_id));
2699 
2700     if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
2701         uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id);
2702         return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager,
2703                                                  dst_gpu,
2704                                                  tracker,
2705                                                  push,
2706                                                  "Copy from %s to %s for block [0x%llx, 0x%llx]",
2707                                                  block_processor_name(va_block, src_id),
2708                                                  block_processor_name(va_block, dst_id),
2709                                                  va_block->start,
2710                                                  va_block->end);
2711     }
2712 
2713     if (uvm_conf_computing_mode_enabled(gpu)) {
2714         // When the Confidential Computing feature is enabled, additional
2715         // dependencies apply to the input tracker as well as the dma_buffer
2716         // tracker.
2717         // * In the CPU to GPU case, UVM performs the CPU-side crypto
2718         //   operations before the GPU copy, so both the dma_buffer tracker
2719         //   and the input tracker must be complete beforehand.
2720         // * In the GPU to CPU case, the GPU copy happens first, but the
2721         //   same principle applies: UVM acquires both trackers.
2722         status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
2723         if (status != NV_OK)
2724             goto error;
2725 
2726         UVM_ASSERT(copy_state->dma_buffer == NULL);
2727         status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
2728                                                      &copy_state->dma_buffer,
2729                                                      &local_tracker);
2730 
2731         if (status != NV_OK)
2732             goto error;
2733 
2734         if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
2735             status = uvm_tracker_wait(&local_tracker);
2736             if (status != NV_OK)
2737                 goto error;
2738         }
2739 
2740         tracker_ptr = &local_tracker;
2741     }
2742 
2743     status = uvm_push_begin_acquire(gpu->channel_manager,
2744                                     channel_type,
2745                                     tracker_ptr,
2746                                     push,
2747                                     "Copy from %s to %s for block [0x%llx, 0x%llx]",
2748                                     block_processor_name(va_block, src_id),
2749                                     block_processor_name(va_block, dst_id),
2750                                     va_block->start,
2751                                     va_block->end);
2752 
2753 error:
2754     // Caller is responsible for freeing the DMA buffer on error
2755     uvm_tracker_deinit(&local_tracker);
2756     return status;
2757 }
2758 
2759 // A page is clean iff:
2760 // - the destination is the preferred location,
2761 // - the source is the CPU,
2762 // - the destination does not support faults/eviction, and
2763 // - the CPU page is not dirty.
2764 static bool block_page_is_clean(uvm_va_block_t *block,
2765                                 uvm_processor_id_t dst_id,
2766                                 uvm_processor_id_t src_id,
2767                                 uvm_page_index_t page_index)
2768 {
2769     return !uvm_va_block_is_hmm(block) &&
2770            uvm_id_equal(dst_id, uvm_va_range_get_policy(block->va_range)->preferred_location) &&
2771            UVM_ID_IS_CPU(src_id) &&
2772            !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
2773            !block_cpu_page_is_dirty(block, page_index);
2774 }
2775 
2776 // When the destination is the CPU:
2777 // - if the source is the preferred location, mark the page as clean
2778 // - otherwise, mark it as dirty
2779 static void block_update_page_dirty_state(uvm_va_block_t *block,
2780                                           uvm_processor_id_t dst_id,
2781                                           uvm_processor_id_t src_id,
2782                                           uvm_page_index_t page_index)
2783 {
2784     if (UVM_ID_IS_GPU(dst_id))
2785         return;
2786 
2787     if (uvm_id_equal(src_id, uvm_va_range_get_policy(block->va_range)->preferred_location))
2788         block_mark_cpu_page_clean(block, page_index);
2789     else
2790         block_mark_cpu_page_dirty(block, page_index);
2791 }
2792 
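     // Mark the block's backing storage on the given processor as used for
     // eviction accounting: for max-size, non-HMM blocks on GPUs that support
     // eviction, the root chunk is marked as used in PMM. No-op for the CPU.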
2793 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
2794 {
2795     uvm_gpu_t *gpu;
2796 
2797     if (UVM_ID_IS_CPU(id))
2798         return;
2799 
2800     gpu = block_get_gpu(block, id);
2801 
2802     // If the block is of the max size and the GPU supports eviction, mark the
2803     // root chunk as used in PMM.
2804     // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks.
2805     if (!uvm_va_block_is_hmm(block) &&
2806         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2807         uvm_gpu_supports_eviction(gpu)) {
2808         // The chunk has to be there if this GPU is resident
2809         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id));
2810         uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]);
2811     }
2812 }
2813 
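     // Mark the given processor as having pages resident in this block. The
     // backing storage is marked as used on the first transition only.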
2814 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2815 {
2816     UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2817 
2818     if (uvm_processor_mask_test_and_set(&block->resident, id))
2819         return;
2820 
2821     block_mark_memory_used(block, id);
2822 }
2823 
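     // Clear the given processor's resident bit once it has no pages resident
     // in this block, marking the backing root chunk as unused in PMM when
     // applicable.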
2824 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
2825 {
2826     uvm_gpu_t *gpu;
2827 
2828     UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id)));
2829 
2830     if (!uvm_processor_mask_test_and_clear(&block->resident, id))
2831         return;
2832 
2833     if (UVM_ID_IS_CPU(id))
2834         return;
2835 
2836     gpu = block_get_gpu(block, id);
2837 
2838     // If the block is of the max size and the GPU supports eviction, mark the
2839     // root chunk as unused in PMM.
2840     if (!uvm_va_block_is_hmm(block) &&
2841         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
2842         uvm_gpu_supports_eviction(gpu)) {
2843         // The chunk may not be there any more when residency is cleared.
2844         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2845         if (gpu_state && gpu_state->chunks[0])
2846             uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]);
2847     }
2848 }
2849 
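     // Verify that the copy address derived from a cached physically-contiguous
     // base address matches the per-page copy address. Used in assertions.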
2850 static bool block_phys_copy_contig_check(uvm_va_block_t *block,
2851                                          uvm_page_index_t page_index,
2852                                          const uvm_gpu_address_t *base_address,
2853                                          uvm_processor_id_t proc_id,
2854                                          uvm_gpu_t *copying_gpu)
2855 {
2856     uvm_gpu_address_t page_address;
2857     uvm_gpu_address_t contig_address = *base_address;
2858 
2859     contig_address.address += page_index * PAGE_SIZE;
2860 
2861     page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, page_index), copying_gpu);
2862 
2863     return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
2864 }
2865 
2866 // Check if the VA block has a single physically-contiguous chunk of storage
2867 // on the processor.
2868 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id)
2869 {
2870     uvm_cpu_chunk_t *chunk;
2871 
2872     if (UVM_ID_IS_GPU(id))
2873         return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
2874 
2875     chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), NULL);
2876     return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
2877 }
2878 
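     // Get the block region backed by the same physically-contiguous chunk as
     // page_index on the given resident processor.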
2879 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
2880                                                       uvm_page_index_t page_index,
2881                                                       uvm_processor_id_t resident_id)
2882 {
2883     if (UVM_ID_IS_CPU(resident_id)) {
2884         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
2885         return uvm_cpu_chunk_block_region(block, chunk, page_index);
2886     }
2887     else {
2888         uvm_chunk_size_t chunk_size;
2889         (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size);
2890         return uvm_va_block_chunk_region(block, chunk_size, page_index);
2891     }
2892 }
2893 
2894 // Like block_phys_page_copy_address, but uses the address cached in bca when
2895 // possible.
2896 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
2897                                                 block_copy_addr_t *bca,
2898                                                 uvm_page_index_t page_index,
2899                                                 uvm_gpu_t *copying_gpu)
2900 {
2901     if (bca->is_block_contig) {
2902         uvm_gpu_address_t addr = bca->gpu_address;
2903         addr.address += page_index * PAGE_SIZE;
2904         UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, copying_gpu));
2905         return addr;
2906     }
2907 
2908     return block_phys_page_copy_address(block, block_phys_page(bca->id, page_index), copying_gpu);
2909 }
2910 
2911 // When the Confidential Computing feature is enabled, the function performs
2912 // CPU-side page encryption and GPU-side decryption into the CPR. GPU
2913 // operations respect the caller's membar previously set in the push.
2914 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
2915                                                       block_copy_state_t *copy_state,
2916                                                       uvm_va_block_region_t region,
2917                                                       uvm_push_t *push)
2918 {
2919     uvm_push_flag_t membar_flag = 0;
2920     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2921     uvm_page_index_t page_index = region.first;
2922     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
2923     struct page *src_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
2924     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
2925     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
2926     char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
2927                                         (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
2928     uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
2929     char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
2930 
2931     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
2932     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
2933 
2934     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
2935 
2936     // See comment in block_copy_begin_push.
2937     UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
2938 
2939     staging_buffer.address += page_index * PAGE_SIZE;
2940     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2941 
2942     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
2943         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
2944     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
2945         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
2946 
2947     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
2948     // decryption must happen on a PAGE_SIZE basis.
2949     for_each_va_block_page_in_region(page_index, region) {
2950         void *src_cpu_virt_addr;
2951 
2952         // The caller guarantees that all pages in the region are contiguous,
2953         // i.e., that they belong to the same compound page.
2954         UVM_ASSERT(src_page == uvm_cpu_chunk_get_cpu_page(block, page_index));
2955 
2956         src_cpu_virt_addr = kmap(src_page);
2957         uvm_conf_computing_cpu_encrypt(push->channel,
2958                                        cpu_va_staging_buffer,
2959                                        src_cpu_virt_addr,
2960                                        NULL,
2961                                        PAGE_SIZE,
2962                                        cpu_auth_tag_buffer);
2963         kunmap(src_page);
2964 
2965         // The first LCE operation should be non-pipelined to guarantee
2966         // ordering, since we do not know when the last non-pipelined copy was
2967         // pushed. The last one applies the membar planned for the push, if any.
2968         // TODO: 3857691: Inherit policy instead of forcing the first
2969         // invocation to be non-pipelined.
2970         if (page_index > region.first)
2971             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
2972 
2973         if (page_index < (region.outer - 1))
2974             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
2975         else if (membar_flag)
2976             uvm_push_set_flag(push, membar_flag);
2977 
2978         gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
2979 
2980         src_page++;
2981         dst_address.address += PAGE_SIZE;
2982         cpu_va_staging_buffer += PAGE_SIZE;
2983         staging_buffer.address += PAGE_SIZE;
2984         cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2985         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
2986     }
2987 }
2988 
2989 // When the Confidential Computing feature is enabled, the function performs
2990 // GPU-side page encryption. GPU operations respect the caller's membar
2991 // previously set in the push.
2992 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
2993                                                       block_copy_state_t *copy_state,
2994                                                       uvm_va_block_region_t region,
2995                                                       uvm_push_t *push)
2996 {
2997     uvm_push_flag_t membar_flag = 0;
2998     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
2999     uvm_page_index_t page_index = region.first;
3000     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3001     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3002     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3003     uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
3004 
3005     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3006     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
3007 
3008     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
3009 
3010     staging_buffer.address += page_index * PAGE_SIZE;
3011     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3012 
3013     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3014         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3015     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3016         membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3017 
3018     // Because we use kmap() to map pages for CPU-side crypto operations and
3019     // it only guarantees PAGE_SIZE contiguity, all encryption and decryption
3020     // must happen on a PAGE_SIZE basis.
3021     for_each_va_block_page_in_region(page_index, region) {
3022         uvm_conf_computing_log_gpu_encryption(push->channel, &dma_buffer->decrypt_iv[page_index]);
3023 
3024         // The first LCE operation should be non-pipelined to guarantee
3025         // ordering, since we do not know when the last non-pipelined copy was
3026         // pushed. The last one applies the membar planned for the push, if any.
3027         // TODO: 3857691: Inherit policy instead of forcing the first
3028         // invocation to be non-pipelined.
3029         if (page_index > region.first)
3030             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3031 
3032         if (page_index < (region.outer - 1))
3033             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3034         else if (membar_flag)
3035             uvm_push_set_flag(push, membar_flag);
3036 
3037         gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3038 
3039         src_address.address += PAGE_SIZE;
3040         staging_buffer.address += PAGE_SIZE;
3041         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3042     }
3043 
3044     uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3045 }
3046 
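     // For GPU to CPU copies with Confidential Computing enabled, wait for the
     // push to complete and CPU-decrypt the staged pages into their final CPU
     // pages. No-op when the destination is a GPU.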
3047 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3048                                                   block_copy_state_t *copy_state,
3049                                                   uvm_push_t *push)
3050 {
3051     NV_STATUS status;
3052     uvm_page_index_t page_index;
3053     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3054     uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3055     void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3056     void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3057 
3058     UVM_ASSERT(uvm_conf_computing_mode_enabled(push->gpu));
3059 
3060     if (UVM_ID_IS_GPU(copy_state->dst.id))
3061         return NV_OK;
3062 
3063     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3064 
3065     status = uvm_push_wait(push);
3066     if (status != NV_OK)
3067         return status;
3068 
3069     // kmap() only guarantees PAGE_SIZE contiguity, so all encryption and
3070     // decryption must happen on a PAGE_SIZE basis.
3071     for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3072         struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, page_index);
3073         void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3074         void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3075         void *cpu_page_address = kmap(dst_page);
3076 
3077         status = uvm_conf_computing_cpu_decrypt(push->channel,
3078                                                 cpu_page_address,
3079                                                 staging_buffer,
3080                                                 &dma_buffer->decrypt_iv[page_index],
3081                                                 PAGE_SIZE,
3082                                                 auth_tag_buffer);
3083         kunmap(dst_page);
3084         if (status != NV_OK) {
3085             // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3086             //                    failures & other failures gracefully.
3087             // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3088             // tag verification fails. Should this happen, it is considered a
3089             // critical failure from which there is no recovery.
3090             uvm_global_set_fatal_error(status);
3091             return status;
3092         }
3093     }
3094 
3095     return NV_OK;
3096 }
3097 
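     // Push the copy of the given region described by copy_state. With
     // Confidential Computing enabled the data is staged through encrypted
     // buffers; otherwise a single CE memcopy covers the region.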
3098 static void block_copy_push(uvm_va_block_t *block,
3099                             block_copy_state_t *copy_state,
3100                             uvm_va_block_region_t region,
3101                             uvm_push_t *push)
3102 {
3103     uvm_gpu_address_t gpu_dst_address;
3104     uvm_gpu_address_t gpu_src_address;
3105     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3106 
3107     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3108 
3109     if (uvm_conf_computing_mode_enabled(gpu)) {
3110         if (UVM_ID_IS_CPU(copy_state->src.id))
3111             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3112         else
3113             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3114 
3115         return;
3116     }
3117 
3118     gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3119     gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3120     gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3121 }
3122 
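     // End the push begun by block_copy_begin_push() and add it to
     // copy_tracker. With Confidential Computing enabled, this also completes
     // the CPU-side decryption of staged pages and returns the DMA buffer to
     // its pool.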
3123 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3124                                      block_copy_state_t *copy_state,
3125                                      uvm_tracker_t *copy_tracker,
3126                                      NV_STATUS push_status,
3127                                      uvm_push_t *push)
3128 {
3129     NV_STATUS tracker_status;
3130 
3131     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3132     //       by that GPU, use a GPU-local membar if no peer can currently
3133     //       map this page. When peer access gets enabled, do a MEMBAR_SYS
3134     //       at that point.
3135     uvm_push_end(push);
3136 
3137     if ((push_status == NV_OK) && uvm_conf_computing_mode_enabled(push->gpu))
3138         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3139 
3140     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
3141     if (push_status == NV_OK)
3142         push_status = tracker_status;
3143 
3144     if (uvm_conf_computing_mode_enabled(push->gpu)) {
3145         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3146 
3147         uvm_tracker_overwrite_with_push(&local_tracker, push);
3148         uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
3149                                            copy_state->dma_buffer,
3150                                            &local_tracker);
3151         copy_state->dma_buffer = NULL;
3152         uvm_tracker_deinit(&local_tracker);
3153     }
3154 
3155     return push_status;
3156 }
3157 
3158 // Copies pages resident on the src_id processor to the dst_id processor
3159 //
3160 // The function adds the pages that were successfully copied to the output
3161 // migrated_pages mask and returns the number of pages in copied_pages. These
3162 // fields are reliable even if an error is returned.
3163 //
3164 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
3165 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
3166                                                    uvm_va_block_context_t *block_context,
3167                                                    uvm_processor_id_t dst_id,
3168                                                    uvm_processor_id_t src_id,
3169                                                    uvm_va_block_region_t region,
3170                                                    uvm_page_mask_t *copy_mask,
3171                                                    const uvm_page_mask_t *prefetch_page_mask,
3172                                                    uvm_va_block_transfer_mode_t transfer_mode,
3173                                                    uvm_page_mask_t *migrated_pages,
3174                                                    NvU32 *copied_pages,
3175                                                    uvm_tracker_t *copy_tracker)
3176 {
3177     NV_STATUS status = NV_OK;
3178     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3179     uvm_gpu_t *copying_gpu = NULL;
3180     uvm_push_t push;
3181     uvm_page_index_t page_index;
3182     uvm_page_index_t contig_start_index = region.outer;
3183     uvm_page_index_t last_index = region.outer;
3184     uvm_range_group_range_t *rgr = NULL;
3185     bool rgr_has_changed = false;
3186     uvm_make_resident_cause_t cause = block_context->make_resident.cause;
3187     uvm_make_resident_cause_t contig_cause = cause;
3188     const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3189                                cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3190                                cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask;
3191     block_copy_state_t copy_state = {0};
3192     uvm_va_range_t *va_range = block->va_range;
3193     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3194 
3195     copy_state.src.id = src_id;
3196     copy_state.dst.id = dst_id;
3197     copy_state.src.is_block_contig = is_block_phys_contig(block, src_id);
3198     copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id);
3199 
3200     *copied_pages = 0;
3201 
3202     // If there are no pages to be copied, exit early
3203     if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask) ||
3204         !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
3205         return NV_OK;
3206 
3207     // uvm_range_group_range_iter_first should only be called when the va_space
3208     // lock is held, which is always the case unless an eviction is taking
3209     // place.
3210     if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3211         rgr = uvm_range_group_range_iter_first(va_space,
3212                                                uvm_va_block_region_start(block, region),
3213                                                uvm_va_block_region_end(block, region));
3214         rgr_has_changed = true;
3215     }
3216 
3217     if (UVM_ID_IS_CPU(dst_id)) {
3218         uvm_memcg_context_t memcg_context;
3219 
3220         // To support staging through CPU, populate CPU pages on demand.
3221         // GPU destinations should have their pages populated already, but
3222         // that might change if we add staging through GPUs.
3223         uvm_memcg_context_start(&memcg_context, block_context->mm);
3224         status = block_populate_pages_cpu(block, copy_mask, region, block_context);
3225         uvm_memcg_context_end(&memcg_context);
3226         if (status != NV_OK)
3227             return status;
3228     }
3229 
3230     // TODO: Bug 3745051: This function is complicated and needs refactoring
3231     for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
3232         NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
3233         uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ?
3234                                                 UVM_MAKE_RESIDENT_CAUSE_PREFETCH:
3235                                                 cause;
3236 
3237         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3238         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3239 
3240         // If we're not evicting and we're migrating away from the preferred
3241         // location, then we should add the range group range to the list of
3242         // migrated ranges in the range group. It's safe to skip this because
3243         // the use of range_group's migrated_ranges list is a UVM-Lite
3244         // optimization - eviction is not supported on UVM-Lite GPUs.
3245         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
3246             uvm_id_equal(src_id, uvm_va_range_get_policy(va_range)->preferred_location)) {
3247             // rgr_has_changed is used to minimize the number of times the
3248             // migrated_ranges_lock is taken. It is set to false when the range
3249             // group range pointed by rgr is added to the migrated_ranges list,
3250             // and it is just set back to true when we move to a different
3251             // range group range.
3252 
3253             // The current page could be after the end of rgr. Iterate over the
3254             // range group ranges until rgr's end location is greater than or
3255             // equal to the current page.
3256             while (rgr && rgr->node.end < page_start) {
3257                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
3258                 rgr_has_changed = true;
3259             }
3260 
3261             // Check whether the current page lies within rgr. A single page
3262             // must entirely reside within a range group range. Since we've
3263             // incremented rgr until its end is higher than page_start, we now
3264             // check if page_start lies within rgr.
3265             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
3266                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
3267                 if (list_empty(&rgr->range_group_migrated_list_node))
3268                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
3269                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
3270 
3271                 rgr_has_changed = false;
3272             }
3273         }
3274 
3275         // No need to copy pages that haven't changed. Just clear residency
3276         // information.
3277         if (block_page_is_clean(block, dst_id, src_id, page_index))
3278             continue;
3279 
3280         if (!copying_gpu) {
3281             status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
3282 
3283             if (status != NV_OK)
3284                 break;
3285             copying_gpu = uvm_push_get_gpu(&push);
3286 
3287             // Record all processors involved in the copy
3288             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
3289             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
3290             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
3291 
3292             // This function is called just once per VA block and needs to
3293             // receive the "main" cause for the migration (it mainly checks if
3294             // we are in the eviction path). Therefore, we pass cause instead
3295             // of contig_cause.
3296             uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
3297         }
3298         else {
3299             uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3300         }
3301 
3302         if (!uvm_va_block_is_hmm(block))
3303             block_update_page_dirty_state(block, dst_id, src_id, page_index);
3304 
3305         if (last_index == region.outer) {
3306             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
3307             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
3308             contig_start_index = page_index;
3309             contig_cause = page_cause;
3310 
3311             // When CC is enabled, transfers between GPU and CPU don't rely on
3312             // any GPU mapping of CPU chunks, physical or virtual.
3313             if (UVM_ID_IS_CPU(src_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3314                 can_cache_src_phys_addr = false;
3315             if (UVM_ID_IS_CPU(dst_id) && uvm_conf_computing_mode_enabled(copying_gpu))
3316                 can_cache_dst_phys_addr = false;
3317             // Computing the physical address is a non-trivial operation and
3318             // seems to be a performance limiter on systems with 2 or more
3319             // NVLINK links. Therefore, for physically-contiguous block
3320             // storage, we cache the start address and compute the page address
3321             // using the page index.
3322             if (can_cache_src_phys_addr) {
3323                 copy_state.src.gpu_address = block_phys_page_copy_address(block,
3324                                                                           block_phys_page(src_id, 0),
3325                                                                           copying_gpu);
3326             }
3327             if (can_cache_dst_phys_addr) {
3328                 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
3329                                                                           block_phys_page(dst_id, 0),
3330                                                                           copying_gpu);
3331             }
3332         }
3333         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
3334             uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3335             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3336 
3337             // If both src and dst are physically-contiguous, consolidate copies
3338             // of contiguous pages into a single method.
3339             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3340                 block_copy_push(block, &copy_state, contig_region, &push);
3341 
3342             uvm_perf_event_notify_migration(&va_space->perf_events,
3343                                             &push,
3344                                             block,
3345                                             dst_id,
3346                                             src_id,
3347                                             uvm_va_block_region_start(block, contig_region),
3348                                             uvm_va_block_region_size(contig_region),
3349                                             transfer_mode,
3350                                             contig_cause,
3351                                             &block_context->make_resident);
3352 
3353             contig_start_index = page_index;
3354             contig_cause = page_cause;
3355         }
3356 
3357         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig)
3358             block_copy_push(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
3359 
3360         last_index = page_index;
3361     }
3362 
3363     // Copy the remaining pages
3364     if (copying_gpu) {
3365         uvm_va_block_region_t contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
3366         UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
3367 
3368         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig)
3369             block_copy_push(block, &copy_state, contig_region, &push);
3370 
3371         uvm_perf_event_notify_migration(&va_space->perf_events,
3372                                         &push,
3373                                         block,
3374                                         dst_id,
3375                                         src_id,
3376                                         uvm_va_block_region_start(block, contig_region),
3377                                         uvm_va_block_region_size(contig_region),
3378                                         transfer_mode,
3379                                         contig_cause,
3380                                         &block_context->make_resident);
3381 
3382         status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
3383     }
3384 
3385     // Update VA block status bits
3386     //
3387     // Only update the bits for the pages that succeeded
3388     if (status != NV_OK)
3389         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
3390 
3391     *copied_pages = uvm_page_mask_weight(copy_mask);
3392     if (*copied_pages)
3393         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
3394 
3395     return status;
3396 }
3397 
3398 // Copy resident pages to the destination from all source processors in the
3399 // src_processor_mask
3400 //
3401 // The function adds the pages that were successfully copied to the output
3402 // migrated_pages mask and returns the number of pages in copied_pages. These
3403 // fields are reliable even if an error is returned.
3404 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
3405                                                 uvm_va_block_context_t *block_context,
3406                                                 uvm_processor_id_t dst_id,
3407                                                 const uvm_processor_mask_t *src_processor_mask,
3408                                                 uvm_va_block_region_t region,
3409                                                 const uvm_page_mask_t *page_mask,
3410                                                 const uvm_page_mask_t *prefetch_page_mask,
3411                                                 uvm_va_block_transfer_mode_t transfer_mode,
3412                                                 NvU32 max_pages_to_copy,
3413                                                 uvm_page_mask_t *migrated_pages,
3414                                                 NvU32 *copied_pages_out,
3415                                                 uvm_tracker_t *tracker_out)
3416 {
3417     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3418     uvm_processor_id_t src_id;
3419     uvm_processor_mask_t search_mask;
3420     uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
3421 
3422     uvm_processor_mask_copy(&search_mask, src_processor_mask);
3423 
3424     *copied_pages_out = 0;
3425 
3426     for_each_closest_id(src_id, &search_mask, dst_id, va_space) {
3427         uvm_page_mask_t *src_resident_mask = uvm_va_block_resident_mask_get(block, src_id);
3428         NV_STATUS status;
3429         NvU32 copied_pages_from_src;
3430 
3431         UVM_ASSERT(!uvm_id_equal(src_id, dst_id));
3432 
3433         uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
3434 
3435         if (page_mask)
3436             uvm_page_mask_and(copy_mask, copy_mask, page_mask);
3437 
3438         status = block_copy_resident_pages_between(block,
3439                                                    block_context,
3440                                                    dst_id,
3441                                                    src_id,
3442                                                    region,
3443                                                    copy_mask,
3444                                                    prefetch_page_mask,
3445                                                    transfer_mode,
3446                                                    migrated_pages,
3447                                                    &copied_pages_from_src,
3448                                                    tracker_out);
3449         *copied_pages_out += copied_pages_from_src;
3450         UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
3451 
3452         if (status != NV_OK)
3453             return status;
3454 
3455         // Break out once we have copied the maximum number of pages
3456         if (*copied_pages_out == max_pages_to_copy)
3457             break;
3458     }
3459 
3460     return NV_OK;
3461 }
3462 
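     // Break read duplication for the given pages: the pages remain resident
     // only on dst_id and their read_duplicated bits are cleared.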
3463 static void break_read_duplication_in_region(uvm_va_block_t *block,
3464                                              uvm_va_block_context_t *block_context,
3465                                              uvm_processor_id_t dst_id,
3466                                              uvm_va_block_region_t region,
3467                                              const uvm_page_mask_t *page_mask)
3468 {
3469     uvm_processor_id_t id;
3470     uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask;
3471 
3472     uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
3473 
3474     UVM_ASSERT(uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id)));
3475 
3476     // Clear read_duplicated bit for all pages in region
3477     uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
3478 
3479     // Clear residency bits for all processors other than dst_id
3480     for_each_id_in_mask(id, &block->resident) {
3481         uvm_page_mask_t *other_resident_mask;
3482 
3483         if (uvm_id_equal(id, dst_id))
3484             continue;
3485 
3486         other_resident_mask = uvm_va_block_resident_mask_get(block, id);
3487 
3488         if (!uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region))
3489             block_clear_resident_processor(block, id);
3490     }
3491 }
3492 
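     // Mark pages in the region that are populated on dst_id but not resident
     // anywhere as resident on dst_id (first touch), and record them in the
     // pages_changed_residency output mask.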
3493 static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
3494                                                  uvm_va_block_context_t *block_context,
3495                                                  uvm_processor_id_t dst_id,
3496                                                  uvm_va_block_region_t region,
3497                                                  const uvm_page_mask_t *page_mask)
3498 {
3499     uvm_page_index_t page_index;
3500     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3501     uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
3502 
3503     if (page_mask)
3504         uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask);
3505     else
3506         uvm_page_mask_complement(first_touch_mask, resident_mask);
3507 
3508     uvm_page_mask_region_clear_outside(first_touch_mask, region);
3509 
3510     for_each_va_block_page_in_mask(page_index, first_touch_mask, block) {
3511         UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index));
3512         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
3513         UVM_ASSERT(block_check_resident_proximity(block, page_index, dst_id));
3514     }
3515 
3516     uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
3517     if (!uvm_page_mask_empty(resident_mask))
3518         block_set_resident_processor(block, dst_id);
3519 
3520     // Add them to the output mask, too
3521     uvm_page_mask_or(&block_context->make_resident.pages_changed_residency,
3522                      &block_context->make_resident.pages_changed_residency,
3523                      first_touch_mask);
3524 }
3525 
3526 // Copy resident pages from other processors to the destination.
3527 // All the pages on the destination need to be populated by the caller first.
3528 // Pages not resident anywhere else need to be zeroed out as well.
3529 // The transfer_mode is only used to tell uvm_perf_event_notify_migration()
3530 // whether the copy is for a migration or read duplication.
3531 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
3532                                            uvm_va_block_context_t *block_context,
3533                                            uvm_processor_id_t dst_id,
3534                                            uvm_va_block_region_t region,
3535                                            const uvm_page_mask_t *page_mask,
3536                                            const uvm_page_mask_t *prefetch_page_mask,
3537                                            uvm_va_block_transfer_mode_t transfer_mode)
3538 {
3539     NV_STATUS status = NV_OK;
3540     NV_STATUS tracker_status;
3541     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3542     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id);
3543     NvU32 missing_pages_count;
3544     NvU32 pages_copied;
3545     NvU32 pages_copied_to_cpu;
3546     uvm_processor_mask_t src_processor_mask;
3547     uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
3548     uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
3549     uvm_page_mask_t *staged_pages = &block_context->make_resident.pages_staged;
3550 
3551     uvm_page_mask_zero(migrated_pages);
3552     uvm_page_mask_zero(staged_pages);
3553 
3554     if (page_mask)
3555         uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
3556     else
3557         uvm_page_mask_complement(copy_page_mask, resident_mask);
3558 
3559     missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region);
3560 
3561     if (missing_pages_count == 0)
3562         goto out;
3563 
3564     // TODO: Bug 1753731: Add P2P2P copies staged through a GPU
3565     // TODO: Bug 1753731: When a page is resident in multiple locations due to
3566     //       read-duplication, spread out the source of the copy so we don't
3567     //       bottleneck on a single location.
3568 
3569     uvm_processor_mask_zero(&src_processor_mask);
3570 
3571     if (!uvm_id_equal(dst_id, UVM_ID_CPU)) {
3572         // If the destination is a GPU, first copy everything from processors
3573         // with supported copy access. Notably this will copy pages from the
3574         // CPU as well, even if some extra copies from the CPU are later
3575         // required for staging.
3576         uvm_processor_mask_and(&src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
3577         uvm_processor_mask_clear(&src_processor_mask, dst_id);
3578 
3579         status = block_copy_resident_pages_mask(block,
3580                                                 block_context,
3581                                                 dst_id,
3582                                                 &src_processor_mask,
3583                                                 region,
3584                                                 copy_page_mask,
3585                                                 prefetch_page_mask,
3586                                                 transfer_mode,
3587                                                 missing_pages_count,
3588                                                 migrated_pages,
3589                                                 &pages_copied,
3590                                                 &local_tracker);
3591 
3592         UVM_ASSERT(missing_pages_count >= pages_copied);
3593         missing_pages_count -= pages_copied;
3594 
3595         if (status != NV_OK)
3596             goto out;
3597 
3598         if (missing_pages_count == 0)
3599             goto out;
3600 
3601         if (pages_copied)
3602             uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
3603     }
3604 
3605     // Now copy from everywhere else to the CPU. This is both for when the
3606     // destination is the CPU (src_processor_mask empty) and for a staged copy
3607     // (src_processor_mask containing processors with copy access to dst_id).
3608     uvm_processor_mask_andnot(&src_processor_mask, &block->resident, &src_processor_mask);
3609     uvm_processor_mask_clear(&src_processor_mask, dst_id);
3610     uvm_processor_mask_clear(&src_processor_mask, UVM_ID_CPU);
3611 
3612     status = block_copy_resident_pages_mask(block,
3613                                             block_context,
3614                                             UVM_ID_CPU,
3615                                             &src_processor_mask,
3616                                             region,
3617                                             copy_page_mask,
3618                                             prefetch_page_mask,
3619                                             transfer_mode,
3620                                             missing_pages_count,
3621                                             staged_pages,
3622                                             &pages_copied_to_cpu,
3623                                             &local_tracker);
3624     if (status != NV_OK)
3625         goto out;
3626 
3627     // If the destination is the CPU, we already copied everything there above
3628     if (UVM_ID_IS_CPU(dst_id)) {
3629         uvm_page_mask_or(migrated_pages, migrated_pages, staged_pages);
3630         missing_pages_count -= pages_copied_to_cpu;
3631 
3632         goto out;
3633     }
3634 
3635     // Add everything to the block's tracker so that the
3636     // block_copy_resident_pages_between() call below will acquire it.
3637     status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3638     if (status != NV_OK)
3639         goto out;
3640     uvm_tracker_clear(&local_tracker);
3641 
3642     // Now copy staged pages from the CPU to the destination.
3643     status = block_copy_resident_pages_between(block,
3644                                                block_context,
3645                                                dst_id,
3646                                                UVM_ID_CPU,
3647                                                region,
3648                                                staged_pages,
3649                                                prefetch_page_mask,
3650                                                transfer_mode,
3651                                                migrated_pages,
3652                                                &pages_copied,
3653                                                &local_tracker);
3654 
3655     UVM_ASSERT(missing_pages_count >= pages_copied);
3656     missing_pages_count -= pages_copied;
3657 
3658     if (status != NV_OK)
3659         goto out;
3660 
3661     // If we get here, that means we were staging the copy through the CPU and
3662     // we should copy as many pages from the CPU as we copied to the CPU.
3663     UVM_ASSERT(pages_copied == pages_copied_to_cpu);
3664 
3665 out:
3666     // Add everything from the local tracker to the block's tracker.
3667     // Notably this is also needed for handling
3668     // block_copy_resident_pages_between() failures in the first loop.
3669     tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
3670     uvm_tracker_deinit(&local_tracker);
3671 
3672     return status == NV_OK ? tracker_status : status;
3673 }
3674 
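// Copy phase of a migration to dest_id: unmap the pages being migrated from
// all mapped non-UVM-Lite processors, unmap read-duplicated copies on
// processors other than dest_id, populate memory on dest_id and copy the
// resident data there. Residency state is updated separately by
// uvm_va_block_make_resident_finish().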
3675 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
3676                                           uvm_va_block_retry_t *va_block_retry,
3677                                           uvm_va_block_context_t *va_block_context,
3678                                           uvm_processor_id_t dest_id,
3679                                           uvm_va_block_region_t region,
3680                                           const uvm_page_mask_t *page_mask,
3681                                           const uvm_page_mask_t *prefetch_page_mask,
3682                                           uvm_make_resident_cause_t cause)
3683 {
3684     NV_STATUS status;
3685     uvm_processor_mask_t unmap_processor_mask;
3686     uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask;
3687     uvm_page_mask_t *resident_mask;
3688 
3689     va_block_context->make_resident.dest_id = dest_id;
3690     va_block_context->make_resident.cause = cause;
3691 
3692     if (prefetch_page_mask) {
3693         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3694                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3695                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3696     }
3697 
3698     uvm_assert_mutex_locked(&va_block->lock);
3699     UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
3700     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
3701 
3702     resident_mask = block_resident_mask_get_alloc(va_block, dest_id);
3703     if (!resident_mask)
3704         return NV_ERR_NO_MEMORY;
3705 
3706     // Unmap all mapped processors except for UVM-Lite GPUs as their mappings
3707     // are largely persistent.
3708     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3709 
3710     if (page_mask)
3711         uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask);
3712     else
3713         uvm_page_mask_complement(unmap_page_mask, resident_mask);
3714     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3715 
3716     // Unmap all pages not resident on the destination
3717     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3718     if (status != NV_OK)
3719         return status;
3720 
3721     if (page_mask)
3722         uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages);
3723     else
3724         uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages);
3725     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
3726 
3727     // Also unmap read-duplicated pages excluding dest_id
3728     uvm_processor_mask_clear(&unmap_processor_mask, dest_id);
3729     status = uvm_va_block_unmap_mask(va_block, va_block_context, &unmap_processor_mask, region, unmap_page_mask);
3730     if (status != NV_OK)
3731         return status;
3732 
3733     uvm_tools_record_read_duplicate_invalidate(va_block,
3734                                                dest_id,
3735                                                region,
3736                                                unmap_page_mask);
3737 
3738     // Note that block_populate_pages and block_copy_resident_pages also use
3739     // va_block_context->make_resident.page_mask.
3740     unmap_page_mask = NULL;
3741 
3742     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3743     if (status != NV_OK)
3744         return status;
3745 
3746     return block_copy_resident_pages(va_block,
3747                                      va_block_context,
3748                                      dest_id,
3749                                      region,
3750                                      page_mask,
3751                                      prefetch_page_mask,
3752                                      UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
3753 }
3754 
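// Clear the given pages from dst_id's evicted mask. If no evicted pages
// remain on that GPU, clear it from the block's evicted_gpus mask as well.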
3755 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block,
3756                                               uvm_processor_id_t dst_id,
3757                                               uvm_page_mask_t *page_mask)
3758 {
3759     uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id);
3760 
3761     UVM_ASSERT(dst_gpu_state);
3762 
3763     if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask))
3764         uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id);
3765 }
3766 
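// Update residency tracking after the pages in copy_mask have migrated to
// dst_id: mark them resident on dst_id, accumulate them into
// pages_changed_residency, clear them from maybe_mapped_pages and update the
// per-GPU eviction state according to the migration cause.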
3767 static void block_make_resident_update_state(uvm_va_block_t *va_block,
3768                                              uvm_va_block_context_t *va_block_context,
3769                                              uvm_processor_id_t dst_id,
3770                                              uvm_va_block_region_t region,
3771                                              uvm_page_mask_t *copy_mask,
3772                                              uvm_make_resident_cause_t cause)
3773 {
3774     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id);
3775 
3776     uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
3777     block_set_resident_processor(va_block, dst_id);
3778 
3779     // Accumulate the pages that migrated into the output mask.
3780     uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency,
3781                      &va_block_context->make_resident.pages_changed_residency,
3782                      copy_mask);
3783 
3784     // Any move operation implies that mappings have been removed from all
3785     // non-UVM-Lite GPUs.
3786     uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
3787 
3788     // If we are migrating due to an eviction, mark the source GPUs as evicted
3789     // and record the evicted pages. Otherwise, if we are migrating to a GPU
3790     // that had evicted pages, those pages are no longer evicted there.
3791     if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
3792         uvm_processor_id_t src_id;
3793 
3794         UVM_ASSERT(UVM_ID_IS_CPU(dst_id));
3795 
3796         // Note that the destination is the CPU so this loop excludes it.
3797         for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) {
3798             uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id);
3799 
3800             UVM_ASSERT(src_gpu_state);
3801 
3802             uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask);
3803             uvm_processor_mask_set(&va_block->evicted_gpus, src_id);
3804         }
3805     }
3806     else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id))
3807         block_make_resident_clear_evicted(va_block, dst_id, copy_mask);
3808 }
3809 
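// Finish phase of a migration: update residency for the pages that actually
// migrated, mark first-touch pages resident on the destination, break read
// duplication and refresh the eviction heuristics.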
3810 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
3811                                        uvm_va_block_context_t *va_block_context,
3812                                        uvm_va_block_region_t region,
3813                                        const uvm_page_mask_t *page_mask)
3814 {
3815     uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
3816     uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id;
3817 
3818     uvm_assert_mutex_locked(&va_block->lock);
3819 
3820     if (page_mask)
3821         uvm_page_mask_and(migrated_pages, migrated_pages, page_mask);
3822 
3823     if (!uvm_page_mask_empty(migrated_pages)) {
3824         // The migrated pages are now resident on the destination.
3825         block_make_resident_update_state(va_block,
3826                                          va_block_context,
3827                                          dst_id,
3828                                          region,
3829                                          migrated_pages,
3830                                          va_block_context->make_resident.cause);
3831     }
3832 
3833     // Pages that weren't resident anywhere else were populated at the
3834     // destination directly. Mark them as resident now.
3835     block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask);
3836 
3837     // Break read duplication and clear residency from other processors.
3838     break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask);
3839 
3840     // Update eviction heuristics, if needed. Notably this could repeat the call
3841     // done in block_set_resident_processor(), but that doesn't do anything bad
3842     // and it's simpler to keep it in both places.
3843     //
3844     // Skip this if we didn't do anything (the input region and/or page mask was
3845     // empty).
3846     if (uvm_processor_mask_test(&va_block->resident, dst_id))
3847         block_mark_memory_used(va_block, dst_id);
3848 }
3849 
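// Migrate the requested pages to dest_id by running the copy phase
// (uvm_va_block_make_resident_copy) followed by the state-update phase
// (uvm_va_block_make_resident_finish).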
3850 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
3851                                      uvm_va_block_retry_t *va_block_retry,
3852                                      uvm_va_block_context_t *va_block_context,
3853                                      uvm_processor_id_t dest_id,
3854                                      uvm_va_block_region_t region,
3855                                      const uvm_page_mask_t *page_mask,
3856                                      const uvm_page_mask_t *prefetch_page_mask,
3857                                      uvm_make_resident_cause_t cause)
3858 {
3859     NV_STATUS status;
3860 
3861     status = uvm_va_block_make_resident_copy(va_block,
3862                                              va_block_retry,
3863                                              va_block_context,
3864                                              dest_id,
3865                                              region,
3866                                              page_mask,
3867                                              prefetch_page_mask,
3868                                              cause);
3869     if (status != NV_OK)
3870         return status;
3871 
3872     uvm_va_block_make_resident_finish(va_block,
3873                                       va_block_context,
3874                                       region,
3875                                       page_mask);
3876 
3877     return NV_OK;
3878 }
3879 
3880 // Combination function which prepares the input {region, page_mask} for
3881 // entering read-duplication. It:
3882 // - Unmaps all processors but revoke_id
3883 // - Revokes write access from revoke_id
3884 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block,
3885                                                    uvm_va_block_context_t *va_block_context,
3886                                                    uvm_processor_id_t revoke_id,
3887                                                    uvm_va_block_region_t region,
3888                                                    const uvm_page_mask_t *page_mask)
3889 {
3890     uvm_processor_mask_t unmap_processor_mask;
3891     uvm_processor_id_t unmap_id;
3892     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3893     NV_STATUS status, tracker_status;
3894 
3895     // Unmap everybody except revoke_id
3896     uvm_processor_mask_andnot(&unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
3897     uvm_processor_mask_clear(&unmap_processor_mask, revoke_id);
3898 
3899     for_each_id_in_mask(unmap_id, &unmap_processor_mask) {
3900         status = uvm_va_block_unmap(va_block,
3901                                     va_block_context,
3902                                     unmap_id,
3903                                     region,
3904                                     page_mask,
3905                                     &local_tracker);
3906         if (status != NV_OK)
3907             goto out;
3908     }
3909 
3910     // Revoke WRITE/ATOMIC access permissions from the remaining mapped
3911     // processor.
3912     status = uvm_va_block_revoke_prot(va_block,
3913                                       va_block_context,
3914                                       revoke_id,
3915                                       region,
3916                                       page_mask,
3917                                       UVM_PROT_READ_WRITE,
3918                                       &local_tracker);
3919     if (status != NV_OK)
3920         goto out;
3921 
3922 out:
3923     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
3924     uvm_tracker_deinit(&local_tracker);
3925     return status == NV_OK ? tracker_status : status;
3926 }
3927 
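// Read-duplication variant of uvm_va_block_make_resident(): the data is
// copied to dest_id rather than moved, so the source copies stay resident.
// Remote mappings are removed and write permission is revoked on the
// processors holding the resident copies before the copy, and
// read_duplicated_pages is updated for both dest_id and any pages staged
// through the CPU.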
3928 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
3929                                                     uvm_va_block_retry_t *va_block_retry,
3930                                                     uvm_va_block_context_t *va_block_context,
3931                                                     uvm_processor_id_t dest_id,
3932                                                     uvm_va_block_region_t region,
3933                                                     const uvm_page_mask_t *page_mask,
3934                                                     const uvm_page_mask_t *prefetch_page_mask,
3935                                                     uvm_make_resident_cause_t cause)
3936 {
3937     NV_STATUS status = NV_OK;
3938     uvm_processor_id_t src_id;
3939     uvm_page_mask_t *dst_resident_mask;
3940     uvm_page_mask_t *cpu_resident_mask;
3941     uvm_page_mask_t *migrated_pages;
3942     uvm_page_mask_t *staged_pages;
3943     uvm_page_mask_t *first_touch_mask;
3944 
3945     // TODO: Bug 3660922: need to implement HMM read duplication support.
3946     UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
3947     UVM_ASSERT(va_block_context->policy == uvm_va_range_get_policy(va_block->va_range));
3948 
3949     va_block_context->make_resident.dest_id = dest_id;
3950     va_block_context->make_resident.cause = cause;
3951 
3952     if (prefetch_page_mask) {
3953         // TODO: Bug 1877578: investigate automatic read-duplicate policies
3954         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
3955                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
3956                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
3957     }
3958 
3959     uvm_assert_mutex_locked(&va_block->lock);
3960     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
3961 
3962     // For pages that are entering read-duplication we need to unmap remote
3963     // mappings and revoke RW and higher access permissions.
3964     //
3965     // The current implementation:
3966     // - Unmaps pages from all processors but the one with the resident copy
3967     // - Revokes write access from the processor with the resident copy
3968     for_each_id_in_mask(src_id, &va_block->resident) {
3969         // Note that the below calls to block_populate_pages and
3970         // block_copy_resident_pages also use
3971         // va_block_context->make_resident.page_mask.
3972         uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
3973         const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id);
3974         UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
3975 
3976         if (page_mask)
3977             uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages);
3978         else
3979             uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages);
3980 
3981         // If there are no pages that need to be unmapped/revoked, skip to the
3982         // next processor
3983         if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask))
3984             continue;
3985 
3986         status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
3987         if (status != NV_OK)
3988             return status;
3989     }
3990 
3991     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
3992     if (status != NV_OK)
3993         return status;
3994 
3995     status = block_copy_resident_pages(va_block,
3996                                        va_block_context,
3997                                        dest_id,
3998                                        region,
3999                                        page_mask,
4000                                        prefetch_page_mask,
4001                                        UVM_VA_BLOCK_TRANSFER_MODE_COPY);
4002     if (status != NV_OK)
4003         return status;
4004 
4005     // Pages that weren't resident anywhere else were populated at the
4006     // destination directly. Mark them as resident now, since there were no
4007     // errors from block_copy_resident_pages() above.
4008     // Note that va_block_context->scratch_page_mask is passed to
4009     // block_copy_set_first_touch_residency() which is generally unsafe but in
4010     // this case, block_copy_set_first_touch_residency() copies page_mask
4011     // before scratch_page_mask could be clobbered.
4012     migrated_pages = &va_block_context->make_resident.pages_migrated;
4013     first_touch_mask = &va_block_context->scratch_page_mask;
4014     uvm_page_mask_init_from_region(first_touch_mask, region, page_mask);
4015     uvm_page_mask_andnot(first_touch_mask, first_touch_mask, migrated_pages);
4016 
4017     if (!uvm_page_mask_empty(first_touch_mask))
4018         block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, first_touch_mask);
4019 
4020     staged_pages = &va_block_context->make_resident.pages_staged;
4021     if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
4022         cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU);
4023         uvm_page_mask_or(cpu_resident_mask, cpu_resident_mask, staged_pages);
4024         block_set_resident_processor(va_block, UVM_ID_CPU);
4025         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
4026         uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
4027     }
4028 
4029     if (!uvm_page_mask_empty(migrated_pages)) {
4030         dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id);
4031         uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
4032         block_set_resident_processor(va_block, dest_id);
4033         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
4034         uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
4035     }
4036 
4037     UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION);
4038     if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id))
4039         block_make_resident_clear_evicted(va_block, dest_id, migrated_pages);
4040 
4041     // Update eviction heuristics, if needed. Notably this could repeat the call
4042     // done in block_set_resident_processor(), but that doesn't do anything bad
4043     // and it's simpler to keep it in both places.
4044     //
4045     // Skip this if we didn't do anything (the input region and/or page mask was
4046     // empty).
4047     if (uvm_processor_mask_test(&va_block->resident, dest_id))
4048         block_mark_memory_used(va_block, dest_id);
4049 
4050     return NV_OK;
4051 }
4052 
4053 // Looks up the current CPU mapping state of page from the
4054 // block->cpu.pte_bits bitmaps. If write access is enabled,
4055 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since
4056 // write access implies atomic access for CPUs.
4057 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index)
4058 {
4059     uvm_prot_t prot;
4060 
4061     UVM_ASSERT(!uvm_va_block_is_dead(block));
4062 
4063     if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index))
4064         prot = UVM_PROT_READ_WRITE_ATOMIC;
4065     else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
4066         prot = UVM_PROT_READ_ONLY;
4067     else
4068         prot = UVM_PROT_NONE;
4069 
4070     return prot;
4071 }
4072 
4073 // Looks up the current GPU mapping state of page from the
4074 // block->gpus[i]->pte_bits bitmaps.
4075 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index)
4076 {
4077     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4078     uvm_prot_t prot;
4079 
4080     UVM_ASSERT(!uvm_va_block_is_dead(block));
4081 
4082     if (!gpu_state)
4083         return UVM_PROT_NONE;
4084 
4085     if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
4086         prot = UVM_PROT_READ_WRITE_ATOMIC;
4087     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
4088         prot = UVM_PROT_READ_WRITE;
4089     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
4090         prot = UVM_PROT_READ_ONLY;
4091     else
4092         prot = UVM_PROT_NONE;
4093 
4094     return prot;
4095 }
4096 
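// Looks up the current mapping state of page_index on the given processor by
// dispatching to block_page_prot_cpu() or block_page_prot_gpu().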
4097 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index)
4098 {
4099     if (UVM_ID_IS_CPU(id))
4100         return block_page_prot_cpu(block, page_index);
4101     else
4102         return block_page_prot_gpu(block, block_get_gpu(block, id), page_index);
4103 }
4104 
4105 // Returns true if the block has any valid CPU PTE mapping in the block region.
4106 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region)
4107 {
4108     size_t valid_page;
4109 
4110     UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block));
4111 
4112     // Early-out: check whether any address in this block has a CPU mapping
4113     if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
4114         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]));
4115         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4116         return false;
4117     }
4118 
4119     // All valid mappings have at least read permissions so we only need to
4120     // inspect the read bits.
4121     valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
4122     if (valid_page == region.outer)
4123         return false;
4124 
4125     UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE);
4126     return true;
4127 }
4128 
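// Sanity check that every indirect peer of the chunk's owning GPU has a
// reverse mapping for the chunk's peer address, and that the mapping points
// back to this block and covers exactly the chunk's pages.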
4129 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
4130 {
4131     uvm_gpu_t *accessing_gpu;
4132     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4133 
4134     if (!uvm_pmm_sysmem_mappings_indirect_supported())
4135         return true;
4136 
4137     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
4138         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
4139         uvm_reverse_map_t reverse_map;
4140         size_t num_mappings;
4141 
4142         num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
4143                                                            peer_addr,
4144                                                            uvm_gpu_chunk_get_size(chunk),
4145                                                            &reverse_map,
4146                                                            1);
4147         UVM_ASSERT(num_mappings == 1);
4148         UVM_ASSERT(reverse_map.va_block == block);
4149         UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
4150         UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
4151 
4152         uvm_va_block_release_no_destroy(reverse_map.va_block);
4153     }
4154 
4155     return true;
4156 }
4157 
4158 // Sanity check the given GPU's chunks array
4159 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
4160 {
4161     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
4162     uvm_gpu_t *gpu;
4163     size_t i, num_chunks;
4164     uvm_page_index_t page_index;
4165     uvm_chunk_size_t chunk_size;
4166 
4167     if (!gpu_state)
4168         return true;
4169 
4170     gpu = block_get_gpu(block, id);
4171 
4172     num_chunks = block_num_gpu_chunks(block, gpu);
4173     for (page_index = 0, i = 0; i < num_chunks; i++) {
4174         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
4175         size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
4176 
4177         if (chunk_index != i) {
4178             UVM_ERR_PRINT("chunk index mismatch: calculated %zu, stored at %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n",
4179                            chunk_index,
4180                            i,
4181                            block->start,
4182                            block->end + 1,
4183                            uvm_id_value(id),
4184                            page_index);
4185             return false;
4186         }
4187 
4188         if (chunk) {
4189             if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
4190                 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
4191                               chunk_size,
4192                               uvm_gpu_chunk_get_size(chunk),
4193                               block->start,
4194                               block->end + 1,
4195                               uvm_id_value(id),
4196                               page_index,
4197                               i);
4198                 return false;
4199             }
4200 
4201             if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
4202                 UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
4203                               uvm_pmm_gpu_chunk_state_string(chunk->state),
4204                               block->start,
4205                               block->end + 1,
4206                               uvm_id_value(id),
4207                               page_index,
4208                               i,
4209                               chunk_size);
4210                 return false;
4211             }
4212 
4213             UVM_ASSERT(chunk->va_block == block);
4214             UVM_ASSERT(chunk->va_block_page_index == page_index);
4215 
4216             UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
4217         }
4218 
4219         page_index += chunk_size / PAGE_SIZE;
4220     }
4221 
4222     return true;
4223 }
4224 
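// Sanity check the chunk arrays of all GPUs with state allocated in the
// block, then the CPU chunks.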
4225 static bool block_check_chunks(uvm_va_block_t *va_block)
4226 {
4227     uvm_gpu_id_t id;
4228 
4229     for_each_gpu_id(id) {
4230         if (!block_check_gpu_chunks(va_block, id))
4231             return false;
4232     }
4233 
4234     return block_check_cpu_chunks(va_block);
4235 }
4236 
4237 // Sanity checks for page mappings
4238 static bool block_check_mappings_page(uvm_va_block_t *block, uvm_page_index_t page_index)
4239 {
4240     uvm_processor_mask_t atomic_mappings, write_mappings, read_mappings;
4241     uvm_processor_mask_t lite_read_mappings, lite_atomic_mappings;
4242     uvm_processor_mask_t remaining_mappings, temp_mappings;
4243     uvm_processor_mask_t resident_processors;
4244     const uvm_processor_mask_t *residency_accessible_from = NULL;
4245     const uvm_processor_mask_t *residency_has_native_atomics = NULL;
4246     uvm_processor_id_t residency, id;
4247     uvm_va_range_t *va_range = block->va_range;
4248     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4249     uvm_processor_id_t preferred_location = va_range ?
4250                                             uvm_va_range_get_policy(va_range)->preferred_location :
4251                                             UVM_ID_INVALID;
4252     const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
4253 
4254     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
4255     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
4256     block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, &read_mappings);
4257 
4258     // Each access bit implies all accesses below it
4259     UVM_ASSERT(uvm_processor_mask_subset(&atomic_mappings, &write_mappings));
4260     UVM_ASSERT(uvm_processor_mask_subset(&write_mappings, &read_mappings));
4261     UVM_ASSERT(uvm_processor_mask_subset(&read_mappings, &block->mapped));
4262 
4263     uvm_va_block_page_resident_processors(block, page_index, &resident_processors);
4264     UVM_ASSERT(uvm_processor_mask_subset(&resident_processors, &block->resident));
4265 
4266     // Sanity check block_get_mapped_processors
4267     uvm_processor_mask_copy(&remaining_mappings, &read_mappings);
4268     for_each_id_in_mask(residency, &resident_processors) {
4269         block_get_mapped_processors(block, residency, page_index, &temp_mappings);
4270         UVM_ASSERT(uvm_processor_mask_subset(&temp_mappings, &remaining_mappings));
4271         uvm_processor_mask_andnot(&remaining_mappings, &remaining_mappings, &temp_mappings);
4272     }
4273 
4274     // Any remaining mappings point to non-resident locations, so they must be
4275     // UVM-Lite mappings.
4276     UVM_ASSERT(uvm_processor_mask_subset(&remaining_mappings, uvm_lite_gpus));
4277 
4278     residency = uvm_processor_mask_find_first_id(&resident_processors);
4279 
4280     if (uvm_processor_mask_get_count(&resident_processors) > 0) {
4281         residency_accessible_from    = &va_space->accessible_from[uvm_id_value(residency)];
4282         residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)];
4283     }
4284 
4285     // If the page is not resident, there should be no valid mappings
4286     UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) > 0 ||
4287                    uvm_processor_mask_get_count(&read_mappings) == 0,
4288                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4289                    *resident_processors.bitmap,
4290                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4291                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4292                    *block->read_duplicated_pages.bitmap);
4293 
4294     // Test read_duplicated_pages mask
4295     UVM_ASSERT_MSG((uvm_processor_mask_get_count(&resident_processors) <= 1 &&
4296                      !uvm_page_mask_test(&block->read_duplicated_pages, page_index)) ||
4297                    (uvm_processor_mask_get_count(&resident_processors) > 1 &&
4298                      uvm_page_mask_test(&block->read_duplicated_pages, page_index)),
4299                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4300                    *resident_processors.bitmap,
4301                    *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4302                    *va_space->system_wide_atomics_enabled_processors.bitmap,
4303                    *block->read_duplicated_pages.bitmap);
4304 
4305     if (!uvm_processor_mask_empty(uvm_lite_gpus))
4306         UVM_ASSERT(UVM_ID_IS_VALID(preferred_location));
4307 
4308     // UVM-Lite checks. Since the range group is made non-migratable before the
4309     // actual migrations for that range group happen, we can only make those
4310     // checks which are valid on both migratable and non-migratable range
4311     // groups.
4312     uvm_processor_mask_and(&lite_read_mappings, &read_mappings, uvm_lite_gpus);
4313     uvm_processor_mask_and(&lite_atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4314 
4315     // Any mapping from a UVM-Lite GPU must be atomic...
4316     UVM_ASSERT(uvm_processor_mask_equal(&lite_read_mappings, &lite_atomic_mappings));
4317 
4318     // ... and must have access to preferred_location
4319     if (UVM_ID_IS_VALID(preferred_location)) {
4320         const uvm_processor_mask_t *preferred_location_accessible_from;
4321 
4322         preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)];
4323         UVM_ASSERT(uvm_processor_mask_subset(&lite_atomic_mappings, preferred_location_accessible_from));
4324     }
4325 
4326     for_each_id_in_mask(id, &lite_atomic_mappings)
4327         UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location));
4328 
4329     // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests
4330     uvm_processor_mask_andnot(&read_mappings, &read_mappings, uvm_lite_gpus);
4331     uvm_processor_mask_andnot(&write_mappings, &write_mappings, uvm_lite_gpus);
4332     uvm_processor_mask_andnot(&atomic_mappings, &atomic_mappings, uvm_lite_gpus);
4333 
4334     // Pages set to zero in maybe_mapped_pages must not be mapped on any
4335     // non-UVM-Lite GPU
4336     if (!uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
4337         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&read_mappings) == 0,
4338                        "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
4339                        *resident_processors.bitmap,
4340                        *block->mapped.bitmap,
4341                        *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap);
4342     }
4343 
4344     // Atomic mappings from GPUs with system-wide atomics disabled are treated
4345     // as write mappings. Therefore, we remove them from the atomic mappings mask
4346     uvm_processor_mask_and(&atomic_mappings, &atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
4347 
4348     if (!uvm_processor_mask_empty(&read_mappings)) {
4349         // Read-duplicate: if a page is resident in multiple locations, it
4350         // must be resident locally on each mapped processor.
4351         if (uvm_processor_mask_get_count(&resident_processors) > 1) {
4352             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, &resident_processors),
4353                            "Read-duplicate copies from remote processors\n"
4354                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4355                            *resident_processors.bitmap,
4356                            *read_mappings.bitmap, *write_mappings.bitmap, *atomic_mappings.bitmap,
4357                            *va_space->system_wide_atomics_enabled_processors.bitmap,
4358                            *block->read_duplicated_pages.bitmap);
4359         }
4360         else {
4361             // Processors with mappings must have access to the processor that
4362             // has the valid copy
4363             UVM_ASSERT_MSG(uvm_processor_mask_subset(&read_mappings, residency_accessible_from),
4364                            "Not all processors have access to %s\n"
4365                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4366                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4367                            uvm_va_space_processor_name(va_space, residency),
4368                            *resident_processors.bitmap,
4369                            *read_mappings.bitmap,
4370                            *write_mappings.bitmap,
4371                            *atomic_mappings.bitmap,
4372                            *residency_accessible_from->bitmap,
4373                            *residency_has_native_atomics->bitmap,
4374                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4375             for_each_id_in_mask(id, &read_mappings) {
4376                 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
4377 
4378                 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
4379                     uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
4380                     uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
4381                     uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_phys_page(residency, page_index), NULL);
4382 
4383                     // This function will assert if no mapping exists
4384                     (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
4385                 }
4386             }
4387         }
4388     }
4389 
4390     // If any processor has a writable mapping, there must only be one copy of
4391     // the page in the system
4392     if (!uvm_processor_mask_empty(&write_mappings)) {
4393         UVM_ASSERT_MSG(uvm_processor_mask_get_count(&resident_processors) == 1,
4394                        "Too many resident copies for pages with write_mappings\n"
4395                        "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
4396                        *resident_processors.bitmap,
4397                        *read_mappings.bitmap,
4398                        *write_mappings.bitmap,
4399                        *atomic_mappings.bitmap,
4400                        *va_space->system_wide_atomics_enabled_processors.bitmap,
4401                        *block->read_duplicated_pages.bitmap);
4402     }
4403 
4404     if (!uvm_processor_mask_empty(&atomic_mappings)) {
4405         uvm_processor_mask_t native_atomics;
4406 
4407         uvm_processor_mask_and(&native_atomics, &atomic_mappings, residency_has_native_atomics);
4408 
4409         if (uvm_processor_mask_empty(&native_atomics)) {
4410             // No other faultable processor should be able to write
4411             uvm_processor_mask_and(&write_mappings, &write_mappings, &va_space->faultable_processors);
4412 
4413             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&write_mappings) == 1,
4414                            "Too many write mappings to %s from processors with non-native atomics\n"
4415                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4416                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4417                            uvm_va_space_processor_name(va_space, residency),
4418                            *resident_processors.bitmap,
4419                            *read_mappings.bitmap,
4420                            *write_mappings.bitmap,
4421                            *atomic_mappings.bitmap,
4422                            *residency_accessible_from->bitmap,
4423                            *residency_has_native_atomics->bitmap,
4424                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4425 
4426             // Only one processor outside of the native group can have atomics enabled
4427             UVM_ASSERT_MSG(uvm_processor_mask_get_count(&atomic_mappings) == 1,
4428                            "Too many atomics mappings to %s from processors with non-native atomics\n"
4429                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4430                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4431                            uvm_va_space_processor_name(va_space, residency),
4432                            *resident_processors.bitmap,
4433                            *read_mappings.bitmap,
4434                            *write_mappings.bitmap,
4435                            *atomic_mappings.bitmap,
4436                            *residency_accessible_from->bitmap,
4437                            *residency_has_native_atomics->bitmap,
4438                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4439         }
4440         else {
4441             uvm_processor_mask_t non_native_atomics;
4442 
4443             // One or more processors within the native group have atomics enabled.
4444             // All processors outside of that group may have write but not atomic
4445             // permissions.
4446             uvm_processor_mask_andnot(&non_native_atomics, &atomic_mappings, residency_has_native_atomics);
4447 
4448             UVM_ASSERT_MSG(uvm_processor_mask_empty(&non_native_atomics),
4449                            "atomic mappings to %s from processors native and non-native\n"
4450                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
4451                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
4452                            uvm_va_space_processor_name(va_space, residency),
4453                            *resident_processors.bitmap,
4454                            *read_mappings.bitmap,
4455                            *write_mappings.bitmap,
4456                            *atomic_mappings.bitmap,
4457                            *residency_accessible_from->bitmap,
4458                            *residency_has_native_atomics->bitmap,
4459                            *va_space->system_wide_atomics_enabled_processors.bitmap);
4460         }
4461     }
4462 
4463     return true;
4464 }
4465 
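// Sanity check the given GPU's PTE state: consistency between the 2M, big and
// 4k page table ranges, agreement between the per-page permission masks and
// the PTE layout, and residency/contiguity of the memory backing any active
// mappings.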
4466 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_gpu_t *gpu)
4467 {
4468     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4469     uvm_va_block_gpu_state_t *resident_gpu_state;
4470     uvm_pte_bits_gpu_t pte_bit;
4471     uvm_processor_id_t resident_id;
4472     uvm_prot_t prot;
4473     NvU32 big_page_size;
4474     size_t num_big_pages, big_page_index;
4475     uvm_va_block_region_t big_region, chunk_region;
4476     uvm_gpu_chunk_t *chunk;
4477 
4478     if (!gpu_state->page_table_range_4k.table)
4479         UVM_ASSERT(!gpu_state->activated_4k);
4480 
4481     if (!gpu_state->page_table_range_big.table) {
4482         UVM_ASSERT(!gpu_state->initialized_big);
4483         UVM_ASSERT(!gpu_state->activated_big);
4484     }
4485 
4486     // It's only safe to check the PTE mappings if we have page tables. See
4487     // uvm_va_block_get_gpu_va_space.
4488     if (!block_gpu_has_page_tables(block, gpu)) {
4489         UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id));
4490         return true;
4491     }
4492 
4493     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4494     num_big_pages = uvm_va_block_num_big_pages(block, big_page_size);
4495 
4496     if (block_gpu_supports_2m(block, gpu)) {
4497         if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) {
4498             // 2M blocks require the 2M entry to be allocated for the lower
4499             // ranges to also be allocated.
4500             UVM_ASSERT(gpu_state->page_table_range_2m.table);
4501         }
4502         else if (gpu_state->page_table_range_2m.table) {
4503             // If the 2M entry is present but the lower ones aren't, the PTE
4504             // must be 2M.
4505             UVM_ASSERT(gpu_state->pte_is_2m);
4506         }
4507     }
4508     else {
4509         UVM_ASSERT(!gpu_state->page_table_range_2m.table);
4510         if (num_big_pages == 0)
4511             UVM_ASSERT(!gpu_state->page_table_range_big.table);
4512     }
4513 
4514     // If we have the big table and it's in use then it must have been
4515     // initialized, even if it doesn't currently contain active PTEs.
4516     if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) ||
4517         (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big))
4518         UVM_ASSERT(gpu_state->initialized_big);
4519 
4520     if (gpu_state->pte_is_2m) {
4521         UVM_ASSERT(block_gpu_supports_2m(block, gpu));
4522         UVM_ASSERT(gpu_state->page_table_range_2m.table);
4523         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
4524         UVM_ASSERT(!gpu_state->force_4k_ptes);
4525 
4526         // GPU architectures which support 2M pages only support 64K as the big
4527         // page size. All of the 2M code assumes that
4528         // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full,
4529         // bitmap_complement, etc).
4530         BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4531 
4532         prot = block_page_prot_gpu(block, gpu, 0);
4533 
4534         // All page permissions match
4535         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4536             if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4537                 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit]));
4538             else
4539                 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit]));
4540         }
4541 
4542         if (prot != UVM_PROT_NONE) {
4543             resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
4544 
4545             // block_check_resident_proximity verifies that no closer processor
4546             // has a resident page, so we don't need to check that all pages
4547             // have the same resident_id.
4548 
4549             // block_check_mappings_page verifies that all pages marked resident
4550             // are backed by populated memory.
4551 
4552             // The mapped processor should be fully resident and physically-
4553             // contiguous.
4554             UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id)));
4555 
4556             if (UVM_ID_IS_GPU(resident_id)) {
4557                 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
4558                 UVM_ASSERT(resident_gpu_state);
4559                 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
4560             }
4561             else {
4562                 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_first_in_region(block,
4563                                                                        uvm_va_block_region_from_block(block),
4564                                                                        NULL);
4565 
4566                 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
4567                 UVM_ASSERT(chunk);
4568                 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
4569             }
4570         }
4571     }
4572     else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
4573         UVM_ASSERT(gpu_state->page_table_range_big.table);
4574         UVM_ASSERT(!gpu_state->force_4k_ptes);
4575         UVM_ASSERT(num_big_pages > 0);
4576         UVM_ASSERT(gpu_state->initialized_big);
4577 
4578         for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) {
4579             big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4580 
4581             if (!test_bit(big_page_index, gpu_state->big_ptes)) {
4582                 // If there are valid mappings but this isn't a big PTE, the
4583                 // mapping must be using the 4k PTEs.
4584                 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region))
4585                     UVM_ASSERT(gpu_state->page_table_range_4k.table);
4586                 continue;
4587             }
4588 
4589             prot = block_page_prot_gpu(block, gpu, big_region.first);
4590 
4591             // All page permissions match
4592             for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
4593                 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
4594                     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region));
4595                 else
4596                     UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region));
4597             }
4598 
4599             if (prot != UVM_PROT_NONE) {
4600                 resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4601 
4602                 // The mapped processor should be fully resident and physically-
4603                 // contiguous. Exception: UVM-Lite GPUs always map the preferred
4604                 // location even if the memory is resident elsewhere. Skip the
4605                 // residency check but still verify contiguity.
4606                 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
4607                     UVM_ASSERT(uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id),
4608                                                          big_region));
4609                 }
4610 
4611                 if (UVM_ID_IS_CPU(resident_id)) {
4612                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, big_region.first);
4613 
4614                     UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
4615                     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
4616                 }
4617                 else {
4618                     // Check GPU chunks
4619                     chunk = block_phys_page_chunk(block, block_phys_page(resident_id, big_region.first), NULL);
4620                     chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
4621                     UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
4622                 }
4623             }
4624         }
4625     }
4626 
4627     return true;
4628 }
4629 
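// Block-wide mapping sanity checks: verify that the resident, mapped and
// evicted processor masks agree with the per-processor page masks, then check
// every page with block_check_mappings_page() and every GPU's PTEs with
// block_check_mappings_ptes().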
4630 static bool block_check_mappings(uvm_va_block_t *block)
4631 {
4632     uvm_page_index_t page_index;
4633     uvm_processor_id_t id;
4634 
4635     // Verify the master masks, since block_check_mappings_page relies on them
4636     for_each_processor_id(id) {
4637         const uvm_page_mask_t *resident_mask, *map_mask;
4638 
4639         if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) {
4640             UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4641             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
4642             UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id));
4643             continue;
4644         }
4645 
4646         resident_mask = uvm_va_block_resident_mask_get(block, id);
4647         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
4648 
4649         map_mask = uvm_va_block_map_mask_get(block, id);
4650         UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask));
4651 
4652         if (UVM_ID_IS_GPU(id)) {
4653             const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id);
4654             UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask));
4655 
4656             // Pages cannot be resident if they are marked as evicted
4657             UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask));
4658 
4659             // Pages cannot be resident on a GPU with no memory
4660             if (!block_processor_has_memory(block, id))
4661                 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
4662         }
4663     }
4664 
4665     // Check that every page has coherent mappings
4666     for_each_va_block_page(page_index, block)
4667         block_check_mappings_page(block, page_index);
4668 
4669     for_each_gpu_id(id) {
4670         if (uvm_va_block_gpu_state_get(block, id)) {
4671             uvm_gpu_t *gpu = block_get_gpu(block, id);
4672 
4673             // Check big and/or 2M PTE state
4674             block_check_mappings_ptes(block, gpu);
4675         }
4676     }
4677 
4678     return true;
4679 }
4680 
4681 // See the comments on uvm_va_block_unmap
4682 static void block_unmap_cpu(uvm_va_block_t *block, uvm_va_block_region_t region, const uvm_page_mask_t *unmap_pages)
4683 {
4684     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4685     uvm_pte_bits_cpu_t pte_bit;
4686     bool unmapped_something = false;
4687     uvm_va_block_region_t subregion;
4688     NvU32 num_mapped_processors;
4689 
4690     // Early-out if nothing in the region is mapped or being unmapped.
4691     if (!block_has_valid_mapping_cpu(block, region) ||
4692         (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])))
4693         return;
4694 
4695     // We can't actually unmap HMM ranges from the CPU here.
4696     // Unmapping happens as part of migrate_vma_setup().
4697     if (uvm_va_block_is_hmm(block)) {
4698         UVM_ASSERT(!uvm_va_block_is_hmm(block));
4699         return;
4700     }
4701 
4702     num_mapped_processors = uvm_processor_mask_get_count(&block->mapped);
4703 
4704     // If we are unmapping a page which we are tracking due to CPU faults with
4705     // correct permissions, clear the info. This will cover both the unmap and
4706     // revoke cases (since we implement CPU revocation by unmap + map)
4707     if (block->cpu.fault_authorized.first_fault_stamp &&
4708         uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index))
4709         block->cpu.fault_authorized.first_fault_stamp = 0;
4710 
4711     for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) {
4712         if (!block_has_valid_mapping_cpu(block, subregion))
4713             continue;
4714 
4715         unmap_mapping_range(va_space->mapping,
4716                             uvm_va_block_region_start(block, subregion),
4717                             uvm_va_block_region_size(subregion), 1);
4718 
4719         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
4720             uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion);
4721 
4722         // If the CPU is the only processor with mappings we can safely mark
4723         // the pages as fully unmapped
4724         if (num_mapped_processors == 1)
4725             uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
4726 
4727         unmapped_something = true;
4728     }
4729 
4730     if (!unmapped_something)
4731         return;
4732 
4733     // Check whether the block has any more mappings
4734     if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) {
4735         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
4736         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
4737     }
4738 
4739     UVM_ASSERT(block_check_mappings(block));
4740 }
4741 
4742 // Given a mask of mapped pages, returns true if any of the pages in the mask
4743 // are mapped remotely by the given GPU.
4744 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
4745                                          uvm_va_block_context_t *block_context,
4746                                          uvm_gpu_id_t gpu_id,
4747                                          const uvm_page_mask_t *mapped_pages)
4748 {
4749     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
4750 
4751     if (!gpu_state)
4752         return false;
4753 
4754     // The caller must ensure that all pages of the input mask are really mapped
4755     UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
4756 
4757     // UVM-Lite GPUs map the preferred location if it's accessible, regardless
4758     // of the resident location.
4759     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) {
4760         if (uvm_page_mask_empty(mapped_pages))
4761             return false;
4762 
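        // The mapping is remote unless this GPU is itself the preferred
        // location.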
4763         return !uvm_id_equal(uvm_va_range_get_policy(block->va_range)->preferred_location, gpu_id);
4764     }
4765 
4766     // Remote pages are pages which are mapped but not resident locally
4767     return uvm_page_mask_andnot(&block_context->scratch_page_mask, mapped_pages, &gpu_state->resident);
4768 }
4769 
4770 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If
4771 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4772 //
4773 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4774 // caller is responsible for ending the TLB batch with the appropriate membar.
4775 static void block_gpu_pte_clear_4k(uvm_va_block_t *block,
4776                                    uvm_gpu_t *gpu,
4777                                    const uvm_page_mask_t *clear_page_mask,
4778                                    NvU64 pte_clear_val,
4779                                    uvm_pte_batch_t *pte_batch,
4780                                    uvm_tlb_batch_t *tlb_batch)
4781 {
4782     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4783     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4784     uvm_gpu_phys_address_t pte_addr;
4785     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4786     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
4787     uvm_va_block_region_t subregion;
4788     size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4789 
4790     for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) {
4791         num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page;
4792 
4793         pte_addr = uvm_page_table_range_entry_address(tree,
4794                                                       &gpu_state->page_table_range_4k,
4795                                                       subregion.first * ptes_per_page);
4796 
4797         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes);
4798 
4799         if (tlb_batch) {
4800             uvm_tlb_batch_invalidate(tlb_batch,
4801                                      uvm_va_block_region_start(block, subregion),
4802                                      uvm_va_block_region_size(subregion),
4803                                      UVM_PAGE_SIZE_4K,
4804                                      UVM_MEMBAR_NONE);
4805         }
4806     }
4807 }
4808 
4809 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id
4810 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use
4811 // block_gpu_pte_clear_4k instead.
4812 //
4813 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
4814 //
4815 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
4816 // caller is responsible for ending the TLB batch with the appropriate membar.
4817 static void block_gpu_pte_write_4k(uvm_va_block_t *block,
4818                                    uvm_gpu_t *gpu,
4819                                    uvm_processor_id_t resident_id,
4820                                    uvm_prot_t new_prot,
4821                                    const uvm_page_mask_t *write_page_mask,
4822                                    uvm_pte_batch_t *pte_batch,
4823                                    uvm_tlb_batch_t *tlb_batch)
4824 {
4825     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4826     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
4827     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
4828     const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
4829     uvm_va_block_region_t contig_region = {0};
4830     uvm_gpu_phys_address_t contig_addr = {0};
4831     uvm_gpu_phys_address_t page_addr = {0};
4832     uvm_page_index_t page_index;
4833     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
4834 
4835     UVM_ASSERT(new_prot != UVM_PROT_NONE);
4836     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
4837 
4838     for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
4839         uvm_gpu_phys_address_t pte_addr;
4840         size_t i;
4841 
4842         // Assume that this mapping will be used to write to the page
4843         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
4844             block_mark_cpu_page_dirty(block, page_index);
4845 
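        // Move to the physically-contiguous region containing this page so the
        // address of each page in the region is a simple offset from contig_addr.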
4846         if (page_index >= contig_region.outer) {
4847             contig_region = block_phys_contig_region(block, page_index, resident_id);
4848             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
4849             page_addr = contig_addr;
4850         }
4851 
4852         page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
4853 
4854         pte_addr = uvm_page_table_range_entry_address(tree,
4855                                                       &gpu_state->page_table_range_4k,
4856                                                       page_index * ptes_per_page);
4857 
4858         // Handle PAGE_SIZE > UVM_PAGE_SIZE_4K: one CPU page may need multiple 4k PTEs
4859         for (i = 0; i < ptes_per_page; i++) {
4860             NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
4861             uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
4862             page_addr.address += UVM_PAGE_SIZE_4K;
4863             pte_addr.address += pte_size;
4864         }
4865 
4866         if (tlb_batch) {
4867             NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index);
4868             uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE);
4869         }
4870     }
4871 }
4872 
4873 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered.
4874 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It
4875 // only writes 4k PTEs, not big PTEs.
4876 //
4877 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the
4878 // mapping from the corresponding big page (0) and which ones should be written
4879 // using memory from resident_id and new_prot (1). Unlike the other pte_write
4880 // functions, new_prot may be UVM_PROT_NONE.
4881 //
4882 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID
4883 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE
4884 // in this case.
4885 //
4886 // new_pages_mask must not be NULL.
4887 //
4888 // No TLB invalidates are required since we've set up the lower PTEs to never be
4889 // cached by the GPU's MMU when covered by larger PTEs.
4890 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
4891                                              uvm_va_block_context_t *block_context,
4892                                              uvm_gpu_t *gpu,
4893                                              uvm_processor_id_t resident_id,
4894                                              uvm_prot_t new_prot,
4895                                              const unsigned long *big_ptes_covered,
4896                                              const uvm_page_mask_t *new_pages_mask,
4897                                              uvm_pte_batch_t *pte_batch)
4898 {
4899     uvm_va_block_region_t big_region;
4900     size_t big_page_index;
4901     uvm_processor_id_t curr_resident_id;
4902     uvm_prot_t curr_prot;
4903     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
4904 
4905     if (UVM_ID_IS_INVALID(resident_id))
4906         UVM_ASSERT(new_prot == UVM_PROT_NONE);
4907 
4908     for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4909         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
4910 
4911         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
4912 
4913         // The unmap path doesn't know the current residency ahead of time, so
4914         // we have to look it up.
4915         if (UVM_ID_IS_INVALID(resident_id)) {
4916             curr_resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
4917         }
4918         else {
4919             // Check that we aren't changing the aperture of the existing
4920             // mappings. It could be legal in some cases (switching from {RO, A}
4921             // to {RO, B} for example) but we'd need to issue TLB membars.
4922             if (curr_prot != UVM_PROT_NONE)
4923                 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block, gpu, big_region.first), resident_id));
4924 
4925             curr_resident_id = resident_id;
4926         }
4927 
4928         // Pages in new_pages_mask under this big page get new_prot
4929         uvm_page_mask_zero(&block_context->scratch_page_mask);
4930         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4931         if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4932             if (new_prot == UVM_PROT_NONE) {
4933                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4934             }
4935             else {
4936                 block_gpu_pte_write_4k(block,
4937                                        gpu,
4938                                        curr_resident_id,
4939                                        new_prot,
4940                                        &block_context->scratch_page_mask,
4941                                        pte_batch,
4942                                        NULL);
4943             }
4944         }
4945 
4946         // All other pages under this big page inherit curr_prot
4947         uvm_page_mask_zero(&block_context->scratch_page_mask);
4948         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
4949         if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
4950             if (curr_prot == UVM_PROT_NONE) {
4951                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
4952             }
4953             else {
4954                 block_gpu_pte_write_4k(block,
4955                                        gpu,
4956                                        curr_resident_id,
4957                                        curr_prot,
4958                                        &block_context->scratch_page_mask,
4959                                        pte_batch,
4960                                        NULL);
4961             }
4962         }
4963     }
4964 }
4965 
4966 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is
4967 // NULL, all big PTEs in the {block, gpu} are cleared.
4968 //
4969 // If tlb_batch is provided, the big PTEs written are added to the batch. The
4970 // caller is responsible for ending the TLB batch with the appropriate membar.
4971 static void block_gpu_pte_clear_big(uvm_va_block_t *block,
4972                                     uvm_gpu_t *gpu,
4973                                     const unsigned long *big_ptes_mask,
4974                                     NvU64 pte_clear_val,
4975                                     uvm_pte_batch_t *pte_batch,
4976                                     uvm_tlb_batch_t *tlb_batch)
4977 {
4978     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
4979     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
4980     NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
4981     uvm_gpu_phys_address_t pte_addr;
4982     NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
4983     size_t big_page_index;
4984     DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4985 
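    // A NULL mask means clear every big PTE the block can contain.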
4986     if (big_ptes_mask)
4987         bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
4988     else
4989         bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size));
4990 
4991     for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
4992         pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables,
4993                                                       &gpu_state->page_table_range_big,
4994                                                       big_page_index);
4995         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1);
4996 
4997         if (tlb_batch) {
4998             uvm_tlb_batch_invalidate(tlb_batch,
4999                                      uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
5000                                      big_page_size,
5001                                      big_page_size,
5002                                      UVM_MEMBAR_NONE);
5003         }
5004     }
5005 }
5006 
5007 // Writes the big PTEs in big_ptes_mask using memory from resident_id with
5008 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
5009 // block_gpu_pte_clear_big instead.
5010 //
5011 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL.
5012 //
5013 // If tlb_batch is provided, the big PTEs written are added to the batch. The
5014 // caller is responsible for ending the TLB batch with the appropriate membar.
5015 static void block_gpu_pte_write_big(uvm_va_block_t *block,
5016                                     uvm_gpu_t *gpu,
5017                                     uvm_processor_id_t resident_id,
5018                                     uvm_prot_t new_prot,
5019                                     const unsigned long *big_ptes_mask,
5020                                     uvm_pte_batch_t *pte_batch,
5021                                     uvm_tlb_batch_t *tlb_batch)
5022 {
5023     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5024     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5025     uvm_page_tree_t *tree = &gpu_va_space->page_tables;
5026     NvU32 big_page_size = tree->big_page_size;
5027     NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
5028     size_t big_page_index;
5029     uvm_va_block_region_t contig_region = {0};
5030     uvm_gpu_phys_address_t contig_addr = {0};
5031     uvm_gpu_phys_address_t page_addr = {0};
5032     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5033 
5034     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5035     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5036     UVM_ASSERT(big_ptes_mask);
5037 
5038     if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5039         UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0);
5040 
5041         if (!gpu->parent->can_map_sysmem_with_large_pages)
5042             UVM_ASSERT(UVM_ID_IS_GPU(resident_id));
5043     }
5044 
5045     for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5046         NvU64 pte_val;
5047         uvm_gpu_phys_address_t pte_addr;
5048         uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5049 
5050         // Assume that this mapping will be used to write to the page
5051         if (new_prot > UVM_PROT_READ_ONLY && UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block)) {
5052             uvm_page_index_t page_index;
5053 
5054             for_each_va_block_page_in_region(page_index, big_region)
5055                 block_mark_cpu_page_dirty(block, page_index);
5056         }
5057 
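        // As in block_gpu_pte_write_4k, move to the physically-contiguous
        // region containing this big page.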
5058         if (big_region.first >= contig_region.outer) {
5059             contig_region = block_phys_contig_region(block, big_region.first, resident_id);
5060             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, contig_region.first), gpu);
5061             page_addr = contig_addr;
5062         }
5063 
5064         page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
5065 
5066         pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index);
5067         pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5068         uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5069 
5070         if (tlb_batch) {
5071             uvm_tlb_batch_invalidate(tlb_batch,
5072                                      uvm_va_block_region_start(block, big_region),
5073                                      big_page_size,
5074                                      big_page_size,
5075                                      UVM_MEMBAR_NONE);
5076         }
5077     }
5078 }
5079 
5080 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in
5081 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and
5082 // tlb_batch in order to poison the now-unused 4k PTEs.
5083 //
5084 // The 4k PTEs are invalidated with the specified membar.
5085 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
5086                                             uvm_va_block_context_t *block_context,
5087                                             uvm_gpu_t *gpu,
5088                                             const unsigned long *big_ptes_to_merge,
5089                                             uvm_push_t *push,
5090                                             uvm_pte_batch_t *pte_batch,
5091                                             uvm_tlb_batch_t *tlb_batch,
5092                                             uvm_membar_t tlb_membar)
5093 {
5094     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5095     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5096     NvU32 big_page_size = tree->big_page_size;
5097     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
5098     size_t big_page_index;
5099     DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5100 
5101     UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5102     UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5103 
5104     // We can be called with the 4k PTEs in two cases:
5105     // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active.
5106     //
5107     // 2) 4k PTEs unallocated. In this case the GPU may or may not have invalid
5108     //    4k PTEs active under the big PTE, depending on whether neighboring
5109     //    blocks caused the page tables to be allocated.
5110     //
5111     // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has
5112     // them cached.
5113 
5114     // Each big PTE is currently invalid, so the 4ks are active (or unallocated).
5115     // First make the big PTEs unmapped to disable future lookups of the 4ks
5116     // under them. We can't directly transition the entry from valid 4k PTEs to
5117     // valid big PTEs, because that could cause the GPU TLBs to cache the same
5118     // VA in different cache lines. That could cause memory ordering to not be
5119     // maintained.
5120     block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch);
5121 
5122     // Now invalidate the big PTEs we just wrote as well as all 4ks under them.
5123     // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only
5124     // need to invalidate the 4k PTEs without actually writing them.
5125     for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5126         uvm_tlb_batch_invalidate(tlb_batch,
5127                                  uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
5128                                  big_page_size,
5129                                  big_page_size | UVM_PAGE_SIZE_4K,
5130                                  UVM_MEMBAR_NONE);
5131     }
5132 
5133     // End the batches for the caller. We need to do this here in order to
5134     // poison the 4ks below.
5135     uvm_pte_batch_end(pte_batch);
5136     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5137 
5138     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5139     // PTEs with a pattern which will trigger fatal faults on access. We have to
5140     // do this after the TLB invalidate of the big PTEs, or the GPU might use
5141     // the new values.
5142     if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) {
5143         uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge);
5144         uvm_pte_batch_begin(push, pte_batch);
5145         block_gpu_pte_clear_4k(block,
5146                                gpu,
5147                                &block_context->scratch_page_mask,
5148                                tree->hal->poisoned_pte(),
5149                                pte_batch,
5150                                NULL);
5151         uvm_pte_batch_end(pte_batch);
5152     }
5153 }
5154 
5155 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}.
5156 //
5157 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5158 // responsible for ending the TLB batch with the appropriate membar.
5159 static void block_gpu_pte_clear_2m(uvm_va_block_t *block,
5160                                    uvm_gpu_t *gpu,
5161                                    uvm_pte_batch_t *pte_batch,
5162                                    uvm_tlb_batch_t *tlb_batch)
5163 {
5164     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5165     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5166     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5167     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5168 
5169     // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE,
5170     // which would cause a problem when trying to make the entry invalid since
5171     // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire
5172     // 16 bytes.
5173     uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1);
5174 
5175     if (tlb_batch)
5176         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5177 }
5178 
5179 // Writes the 2M PTE for {block, gpu} using memory from resident_id with
5180 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
5181 // block_gpu_pte_clear_2m instead.
5182 //
5183 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
5184 // responsible for ending the TLB batch with the appropriate membar.
5185 static void block_gpu_pte_write_2m(uvm_va_block_t *block,
5186                                    uvm_gpu_t *gpu,
5187                                    uvm_processor_id_t resident_id,
5188                                    uvm_prot_t new_prot,
5189                                    uvm_pte_batch_t *pte_batch,
5190                                    uvm_tlb_batch_t *tlb_batch)
5191 {
5192     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5193     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5194     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
5195     uvm_gpu_phys_address_t page_addr;
5196     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
5197     NvU64 pte_val;
5198     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
5199 
5200     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5201     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5202 
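    // Assume that this mapping may be used to write to the page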
5203     if (UVM_ID_IS_CPU(resident_id) && !uvm_va_block_is_hmm(block))
5204         block_mark_cpu_page_dirty(block, 0);
5205 
5206     page_addr = block_phys_page_address(block, block_phys_page(resident_id, 0), gpu);
5207     pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
5208     uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
5209 
5210     if (tlb_batch)
5211         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5212 }
5213 
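// Returns true if a big or 4k page table has been allocated for this
// {block, gpu} but not yet activated under the 2M entry. Always false if the
// block doesn't support 2M PTEs on this GPU.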
5214 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu)
5215 {
5216     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5217 
5218     if (!block_gpu_supports_2m(block, gpu))
5219         return false;
5220 
5221     if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) ||
5222         (gpu_state->page_table_range_4k.table  && !gpu_state->activated_4k))
5223         return true;
5224 
5225     return false;
5226 }
5227 
5228 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or
5229 // activates a newly-allocated page table (big or 4k) while the other is already
5230 // active. The caller must have already written the new PTEs under the table
5231 // with the appropriate membar.
5232 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch)
5233 {
5234     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5235     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5236 
5237     if (!gpu_state->pte_is_2m)
5238         UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu));
5239 
5240     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5241 
5242     // We always need a membar to order PDE/PTE writes with the TLB invalidate.
5243     // write_pde will do a MEMBAR_SYS by default.
5244     if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID)
5245         uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
5246     uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push);
5247 
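    // Wait for the PDE write to land before issuing the TLB invalidate below.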
5248     gpu->parent->host_hal->wait_for_idle(push);
5249 
5250     // Invalidate just the PDE
5251     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
5252 
5253     if (gpu_state->page_table_range_big.table)
5254         gpu_state->activated_big = true;
5255 
5256     if (gpu_state->page_table_range_4k.table)
5257         gpu_state->activated_4k = true;
5258 }
5259 
5260 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should
5261 // have written all lower PTEs as appropriate into the given pte_batch already.
5262 // This function ends the PTE batch, activates the 2M PDE, and does a TLB
5263 // invalidate.
5264 //
5265 // The caller does not need to do any TLB invalidates since none of the lower
5266 // PTEs could be cached.
5267 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block,
5268                                           uvm_gpu_t *gpu,
5269                                           uvm_push_t *push,
5270                                           uvm_pte_batch_t *pte_batch,
5271                                           uvm_tlb_batch_t *tlb_batch,
5272                                           uvm_membar_t tlb_membar)
5273 {
5274     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5275     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5276 
5277     // Step 1: Make the 2M entry invalid. We can't directly transition from a
5278     //         valid 2M PTE to valid lower PTEs, because that could cause the
5279     //         GPU TLBs to cache the same VA in different cache lines. That
5280     //         could cause memory ordering to not be maintained.
5281     //
5282     //         If the 2M PTE is already invalid, no TLB invalidate is needed.
5283 
5284     if (curr_prot == UVM_PROT_NONE) {
5285         // If we aren't downgrading, then we don't need a membar.
5286         UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE);
5287 
5288         // End the batch, which pushes a membar to ensure that the caller's PTE
5289         // writes below 2M are observed before the PDE write we're about to do.
5290         uvm_pte_batch_end(pte_batch);
5291     }
5292     else {
5293         // The 64k and 4k PTEs can't possibly be cached since the 2M entry is
5294         // not yet a PDE, so we just need to invalidate this single 2M entry.
5295         uvm_tlb_batch_begin(tree, tlb_batch);
5296         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5297 
5298         // Make sure the PTE writes are observed before the TLB invalidate
5299         uvm_pte_batch_end(pte_batch);
5300         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5301     }
5302 
5303     // Step 2: Switch the 2M entry from invalid to a PDE. This activates the
5304     //         smaller PTEs.
5305     uvm_tlb_batch_begin(tree, tlb_batch);
5306     block_gpu_write_pde(block, gpu, push, tlb_batch);
5307     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5308 }
5309 
5310 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE.
5311 // Any lower PTEs are invalidated with the specified membar.
5312 static void block_gpu_pte_merge_2m(uvm_va_block_t *block,
5313                                    uvm_va_block_context_t *block_context,
5314                                    uvm_gpu_t *gpu,
5315                                    uvm_push_t *push,
5316                                    uvm_membar_t tlb_membar)
5317 {
5318     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5319     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5320     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5321     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5322     NvU32 tlb_inval_sizes;
5323 
5324     UVM_ASSERT(!gpu_state->pte_is_2m);
5325     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
5326 
5327     // The 2M entry is currently a PDE, so first make it invalid. We can't
5328     // directly transition the entry from a valid PDE to a valid 2M PTE, because
5329     // that could cause the GPU TLBs to cache the same VA in different cache
5330     // lines. That could cause memory ordering to not be maintained.
5331     uvm_pte_batch_begin(push, pte_batch);
5332     block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL);
5333     uvm_pte_batch_end(pte_batch);
5334 
5335     // Now invalidate both the 2M entry we just wrote as well as all lower-level
5336     // entries which could be cached. Subsequent MMU fills will stop at the now-
5337     // invalid 2M entry, so we only need to invalidate the lower PTEs without
5338     // actually writing them.
5339     tlb_inval_sizes = UVM_PAGE_SIZE_2M;
5340     if (gpu_state->page_table_range_big.table)
5341         tlb_inval_sizes |= UVM_PAGE_SIZE_64K;
5342 
5343     // Strictly speaking, we only need to invalidate those 4k ranges which are
5344     // not covered by a big PTE. However, any such invalidate will require
5345     // enough 4k invalidates to force the TLB batching to invalidate everything
5346     // anyway, so just do the simpler thing.
5347     if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5348         tlb_inval_sizes |= UVM_PAGE_SIZE_4K;
5349 
5350     uvm_tlb_batch_begin(tree, tlb_batch);
5351     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE);
5352     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5353 
5354     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
5355     // PTEs with a pattern which will trigger fatal faults on access. We have to
5356     // do this after the TLB invalidate of the 2M entry, or the GPU might use
5357     // the new values.
5358     if (UVM_IS_DEBUG()) {
5359         uvm_pte_batch_begin(push, pte_batch);
5360 
5361         if (gpu_state->page_table_range_big.table) {
5362             block_gpu_pte_clear_big(block,
5363                                     gpu,
5364                                     NULL,
5365                                     tree->hal->poisoned_pte(),
5366                                     pte_batch,
5367                                     NULL);
5368         }
5369 
5370         if (gpu_state->page_table_range_4k.table) {
5371             block_gpu_pte_clear_4k(block,
5372                                    gpu,
5373                                    NULL,
5374                                    tree->hal->poisoned_pte(),
5375                                    pte_batch,
5376                                    NULL);
5377         }
5378 
5379         uvm_pte_batch_end(pte_batch);
5380     }
5381 }
5382 
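// Returns the TLB invalidate membar required for the given PTE operation,
// based on where the memory being mapped or revoked is resident.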
5383 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
5384 {
5385     // Permissions upgrades (MAP) don't need membars
5386     if (pte_op == BLOCK_PTE_OP_MAP)
5387         return UVM_MEMBAR_NONE;
5388 
5389     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
5390     UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE);
5391 
5392     return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id));
5393 }
5394 
5395 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot
5396 // permissions. If the 2M entry is currently a PDE, it is first merged into a
5397 // PTE.
5398 //
5399 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead.
5400 //
5401 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5402 // the TLB membar required.
5403 static void block_gpu_map_to_2m(uvm_va_block_t *block,
5404                                 uvm_va_block_context_t *block_context,
5405                                 uvm_gpu_t *gpu,
5406                                 uvm_processor_id_t resident_id,
5407                                 uvm_prot_t new_prot,
5408                                 uvm_push_t *push,
5409                                 block_pte_op_t pte_op)
5410 {
5411     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5412     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5413     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5414     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5415     uvm_membar_t tlb_membar;
5416 
5417     UVM_ASSERT(new_prot != UVM_PROT_NONE);
5418 
5419     // If we have a mix of big and 4k PTEs, we have to first merge them to an
5420     // invalid 2M PTE.
5421     if (!gpu_state->pte_is_2m) {
5422         block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE);
5423 
5424         gpu_state->pte_is_2m = true;
5425         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5426     }
5427 
5428     // Write the new permissions
5429     uvm_pte_batch_begin(push, pte_batch);
5430     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5431 
5432     block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch);
5433 
5434     uvm_pte_batch_end(pte_batch);
5435 
5436     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5437     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5438 }
5439 
5440 // Combination split + map operation, called when only part of a 2M PTE mapping
5441 // is being changed. This splits an existing valid or invalid 2M PTE into the
5442 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state.
5443 //
5444 // The PTEs covering the pages in pages_to_write are written to the memory on
5445 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5446 //
5447 // The PTEs covering the pages not set in pages_to_write inherit the mapping of
5448 // the current 2M PTE. If the current mapping is valid, it must target
5449 // resident_id.
5450 //
5451 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5452 // the TLB membar required.
5453 static void block_gpu_map_split_2m(uvm_va_block_t *block,
5454                                    uvm_va_block_context_t *block_context,
5455                                    uvm_gpu_t *gpu,
5456                                    uvm_processor_id_t resident_id,
5457                                    const uvm_page_mask_t *pages_to_write,
5458                                    uvm_prot_t new_prot,
5459                                    uvm_push_t *push,
5460                                    block_pte_op_t pte_op)
5461 {
5462     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5463     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5464     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5465     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5466     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5467     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5468     uvm_membar_t tlb_membar;
5469     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5470     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5471     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5472 
5473     UVM_ASSERT(gpu_state->pte_is_2m);
5474 
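    // Without an allocated 4k table, every PTE after the split must be big.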
5475     if (!gpu_state->page_table_range_4k.table)
5476         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5477 
5478     uvm_pte_batch_begin(push, pte_batch);
5479 
5480     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5481     // from the lower levels. This means we don't need to issue a TLB invalidate
5482     // when writing those levels.
5483 
5484     // Cases to handle:
5485     // 1) Big PTEs which inherit curr_prot
5486     // 2) Big PTEs which get new_prot
5487     // 3) Big PTEs which are split to 4k
5488     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5489     //    b) 4k PTEs which get new_prot under the split big PTEs
5490 
5491     // Compute the big PTEs which will need to be split to 4k, if any.
5492     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5493 
5494     if (gpu_state->page_table_range_big.table) {
5495         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5496         // any. These are the big PTEs which are unchanged (uncovered) by the
5497         // operation.
5498         bitmap_andnot(big_ptes_inherit,
5499                       new_pte_state->big_ptes,
5500                       new_pte_state->big_ptes_covered,
5501                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5502 
5503         if (curr_prot == UVM_PROT_NONE) {
5504             block_gpu_pte_clear_big(block,
5505                                     gpu,
5506                                     big_ptes_inherit,
5507                                     tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
5508                                     pte_batch,
5509                                     NULL);
5510         }
5511         else {
5512             block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
5513         }
5514 
5515         // Case 2: Write the new big PTEs
5516         bitmap_and(big_ptes_new_prot,
5517                    new_pte_state->big_ptes,
5518                    new_pte_state->big_ptes_covered,
5519                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5520         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL);
5521 
5522         // Case 3: Write the big PTEs which cover 4k PTEs
5523         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5524 
5525         // We just wrote all possible big PTEs, so mark them as initialized
5526         gpu_state->initialized_big = true;
5527     }
5528     else {
5529         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5530     }
5531 
5532     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
5533     block_gpu_pte_big_split_write_4k(block,
5534                                      block_context,
5535                                      gpu,
5536                                      resident_id,
5537                                      new_prot,
5538                                      big_ptes_split,
5539                                      pages_to_write,
5540                                      pte_batch);
5541 
5542     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5543     // invalidate for the 2M entry.
5544     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5545     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
5546 
5547     gpu_state->pte_is_2m = false;
5548     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5549 }
5550 
5551 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed.
5552 //
5553 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should
5554 // be 4k.
5555 static void block_gpu_split_2m(uvm_va_block_t *block,
5556                                uvm_va_block_context_t *block_context,
5557                                uvm_gpu_t *gpu,
5558                                const unsigned long *new_big_ptes,
5559                                uvm_push_t *push)
5560 {
5561     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5562     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5563     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5564     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5565     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5566     DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5567     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5568     NvU64 unmapped_pte_val;
5569     uvm_processor_id_t curr_residency;
5570 
5571     UVM_ASSERT(gpu_state->pte_is_2m);
5572 
5573     if (new_big_ptes)
5574         bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5575     else
5576         bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5577 
5578     if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5579         UVM_ASSERT(gpu_state->page_table_range_big.table);
5580 
5581     // We're splitting from 2M to big only, so we'll be writing all big PTEs
5582     if (gpu_state->page_table_range_big.table)
5583         gpu_state->initialized_big = true;
5584 
5585     // Cases to handle:
5586     // 1) Big PTEs which inherit curr_prot
5587     // 2) Big PTEs which are split to 4k
5588     //    a) 4k PTEs inherit curr_prot under the split big PTEs
5589 
5590     // big_ptes_split will cover the 4k regions
5591     bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5592     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_split);
5593 
5594     uvm_pte_batch_begin(push, pte_batch);
5595 
5596     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5597     // from the lower levels. This means we don't need to issue a TLB invalidate
5598     // when writing those levels.
5599 
5600     if (curr_prot == UVM_PROT_NONE) {
5601         unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size);
5602 
5603         // Case 2a: Clear the 4k PTEs under big_ptes_split
5604         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5605 
5606         // Case 1: Make the remaining big PTEs unmapped
5607         block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL);
5608     }
5609     else {
5610         curr_residency = block_gpu_get_processor_to_map(block, gpu, 0);
5611 
5612         // Case 2a: Write the new 4k PTEs under big_ptes_split
5613         block_gpu_pte_write_4k(block,
5614                                gpu,
5615                                curr_residency,
5616                                curr_prot,
5617                                &block_context->scratch_page_mask,
5618                                pte_batch,
5619                                NULL);
5620 
5621         // Case 1: Write the new big PTEs
5622         block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL);
5623     }
5624 
5625     // Case 2: Make big_ptes_split invalid to activate the 4k PTEs
5626     if (gpu_state->page_table_range_big.table)
5627         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
5628 
5629     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
5630     // invalidate for the 2M entry. No membar is necessary since we aren't
5631     // changing permissions.
5632     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE);
5633 
5634     gpu_state->pte_is_2m = false;
5635     bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5636 }
5637 
5638 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are
5639 // changed.
5640 //
5641 // big_ptes_to_split must not be NULL.
5642 static void block_gpu_split_big(uvm_va_block_t *block,
5643                                 uvm_va_block_context_t *block_context,
5644                                 uvm_gpu_t *gpu,
5645                                 const unsigned long *big_ptes_to_split,
5646                                 uvm_push_t *push)
5647 {
5648     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5649     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5650     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5651     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5652     NvU32 big_page_size = tree->big_page_size;
5653     uvm_va_block_region_t big_region;
5654     uvm_processor_id_t resident_id;
5655     size_t big_page_index;
5656     uvm_prot_t curr_prot;
5657     DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5658 
5659     UVM_ASSERT(!gpu_state->pte_is_2m);
5660     UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5661     UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5662 
5663     uvm_pte_batch_begin(push, pte_batch);
5664     uvm_tlb_batch_begin(tree, tlb_batch);
5665 
5666     // Write all 4k PTEs under all big PTEs which are being split. We'll make
5667     // the big PTEs inactive below after flushing these writes. No TLB
5668     // invalidate is needed since the big PTE is active.
5669     bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5670     for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5671         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5672         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
5673 
5674         uvm_page_mask_zero(&block_context->scratch_page_mask);
5675         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
5676         if (curr_prot == UVM_PROT_NONE) {
5677             block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
5678         }
5679         else {
5680             __set_bit(big_page_index, big_ptes_valid);
5681 
5682             resident_id = block_gpu_get_processor_to_map(block, gpu, big_region.first);
5683 
5684             block_gpu_pte_write_4k(block,
5685                                    gpu,
5686                                    resident_id,
5687                                    curr_prot,
5688                                    &block_context->scratch_page_mask,
5689                                    pte_batch,
5690                                    NULL);
5691         }
5692     }
5693 
5694     // Unmap the big PTEs which are valid and are being split to 4k. We can't
5695     // directly transition from a valid big PTE to valid lower PTEs, because
5696     // that could cause the GPU TLBs to cache the same VA in different cache
5697     // lines. That could cause memory ordering to not be maintained.
5698     block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
5699 
5700     // End the batches. We have to commit the membars and TLB invalidates
5701     // before we finish splitting formerly-big PTEs. No membar is necessary
5702     // since we aren't changing permissions.
5703     uvm_pte_batch_end(pte_batch);
5704     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5705 
5706     // Finish the split by switching the big PTEs from unmapped to invalid. This
5707     // causes the GPU MMU to start reading the 4k PTEs instead of stopping at
5708     // the unmapped big PTEs.
5709     uvm_pte_batch_begin(push, pte_batch);
5710     uvm_tlb_batch_begin(tree, tlb_batch);
5711 
5712     block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch);
5713 
5714     uvm_pte_batch_end(pte_batch);
5715 
5716     // Finally, activate the page tables if they're inactive
5717     if (block_gpu_needs_to_activate_table(block, gpu))
5718         block_gpu_write_pde(block, gpu, push, tlb_batch);
5719 
5720     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5721 
5722     bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5723 }
5724 
5725 // Changes permissions on some pre-existing mix of big and 4k PTEs into some
5726 // other mix of big and 4k PTEs, as described by
5727 // block_context->mapping.new_pte_state.
5728 //
5729 // The PTEs covering the pages in pages_to_write are written to the memory on
5730 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
5731 //
5732 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
5733 // the TLB membar required.
5734 static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
5735                                      uvm_va_block_context_t *block_context,
5736                                      uvm_gpu_t *gpu,
5737                                      uvm_processor_id_t resident_id,
5738                                      const uvm_page_mask_t *pages_to_write,
5739                                      uvm_prot_t new_prot,
5740                                      uvm_push_t *push,
5741                                      block_pte_op_t pte_op)
5742 {
5743     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5744     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5745     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5746     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5747     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5748     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5749     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5750     DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5751     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5752     uvm_va_block_region_t big_region;
5753     size_t big_page_index;
5754     NvU32 big_page_size = tree->big_page_size;
5755     uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
5756 
5757     UVM_ASSERT(!gpu_state->pte_is_2m);
5758 
5759     uvm_pte_batch_begin(push, pte_batch);
5760     uvm_tlb_batch_begin(tree, tlb_batch);
5761 
5762     // All of these cases might be performed in the same call:
5763     // 1) Split currently-big PTEs to 4k
5764     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
5765     //    b) Write new 4k PTEs which get new_prot under the split big PTEs
5766     // 2) Merge currently-4k PTEs to big with new_prot
5767     // 3) Write currently-big PTEs which wholly get new_prot
5768     // 4) Write currently-4k PTEs which get new_prot
5769     // 5) Initialize big PTEs which are not covered by this operation
5770 
5771     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
5772     // being split. We'll make the big PTEs inactive below after flushing these
5773     // writes. No TLB invalidate is needed since the big PTE is active.
5774     //
5775     // Mask computation: big_before && !big_after
5776     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5777 
5778     block_gpu_pte_big_split_write_4k(block,
5779                                      block_context,
5780                                      gpu,
5781                                      resident_id,
5782                                      new_prot,
5783                                      big_ptes_split,
5784                                      pages_to_write,
5785                                      pte_batch);
5786 
5787     // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and
5788     // remain uncovered after the operation.
5789     //
5790     // Mask computation: !big_before && !big_after
5791     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5792     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
5793     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) {
5794         block_gpu_pte_write_4k(block,
5795                                gpu,
5796                                resident_id,
5797                                new_prot,
5798                                &block_context->scratch_page_mask,
5799                                pte_batch,
5800                                tlb_batch);
5801     }
5802 
5803     // Case 5: If the big page table is newly-allocated, make sure that all big
5804     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
5805     // all initialized to invalid.
5806     //
5807     // The similar case of making newly-allocated big PTEs unmapped when no
5808     // lower 4k table is present is handled by having
5809     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
5810     // appropriately.
5811     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
5812         // TODO: Bug 1766424: If we have the 4k page table already, we could
5813         //       attempt to merge all uncovered big PTE regions when first
5814         //       allocating the big table. That's probably not worth doing.
5815         UVM_ASSERT(gpu_state->page_table_range_4k.table);
5816         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5817         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
5818         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
5819         gpu_state->initialized_big = true;
5820     }
5821 
5822     // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are
5823     // being split to 4k. We can't directly transition from a valid big PTE to
5824     // valid lower PTEs, because that could cause the GPU TLBs to cache the same
5825     // VA in different cache lines. That could cause memory ordering to not be
5826     // maintained.
5827     bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5828     for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
5829         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5830         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first))
5831             __set_bit(big_page_index, big_ptes_mask);
5832     }
5833 
5834     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
5835 
5836     // Case 3: Write the currently-big PTEs which remain big PTEs, and are
5837     // wholly changing permissions.
5838     //
5839     // Mask computation: big_before && big_after && covered
5840     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5841     if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
5842         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch);
5843 
5844     // Case 2 (step 1): Merge the new big PTEs and end the batches, now that
5845     // we've done all of the independent PTE writes we can. This also merges
5846     // newly-allocated uncovered big PTEs to unmapped (see
5847     // block_gpu_compute_new_pte_state).
5848     //
5849     // Mask computation: !big_before && big_after
5850     if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5851         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
5852         // batches.
5853         block_gpu_pte_merge_big_and_end(block,
5854                                         block_context,
5855                                         gpu,
5856                                         big_ptes_merge,
5857                                         push,
5858                                         pte_batch,
5859                                         tlb_batch,
5860                                         tlb_membar);
5861 
5862         // Remove uncovered big PTEs. We needed to merge them to unmapped above,
5863         // but they shouldn't get new_prot below.
5864         bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5865     }
5866     else {
5867         // End the batches. We have to commit the membars and TLB invalidates
5868         // before we finish splitting formerly-big PTEs.
5869         uvm_pte_batch_end(pte_batch);
5870         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5871     }
5872 
5873     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5874         !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
5875         block_gpu_needs_to_activate_table(block, gpu)) {
5876 
5877         uvm_pte_batch_begin(push, pte_batch);
5878         uvm_tlb_batch_begin(tree, tlb_batch);
5879 
5880         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
5881         // switching them from unmapped to invalid. This causes the GPU MMU to
5882         // start reading the 4k PTEs instead of stopping at the unmapped big
5883         // PTEs.
5884         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
5885 
5886         // Case 2 (step 2): Finish merging our big PTEs, if we have any, by
5887         // switching them from unmapped to new_prot.
5888         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch);
5889 
5890         uvm_pte_batch_end(pte_batch);
5891 
5892         // Finally, activate the page tables if they're inactive
5893         if (block_gpu_needs_to_activate_table(block, gpu))
5894             block_gpu_write_pde(block, gpu, push, tlb_batch);
5895 
5896         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
5897     }
5898 
5899     // Update gpu_state
5900     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5901 }
5902 
5903 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is
5904 // merged into a PTE.
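//
// For example, block_unmap_gpu (further below) calls this when
// block_context->mapping.new_pte_state.pte_is_2m is set, i.e. when the whole
// valid 2M PTE or all remaining mapped pages of a split 2M PTE are being
// unmapped at once.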
5905 static void block_gpu_unmap_to_2m(uvm_va_block_t *block,
5906                                   uvm_va_block_context_t *block_context,
5907                                   uvm_gpu_t *gpu,
5908                                   uvm_push_t *push,
5909                                   uvm_membar_t tlb_membar)
5910 {
5911     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5912     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
5913     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5914     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5915 
5916     if (gpu_state->pte_is_2m) {
5917         // If we're already mapped as a valid 2M PTE, just write it to invalid
5918         uvm_pte_batch_begin(push, pte_batch);
5919         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
5920 
5921         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
5922 
5923         uvm_pte_batch_end(pte_batch);
5924         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
5925     }
5926     else {
5927         // Otherwise we have a mix of big and 4K PTEs which need to be merged
5928         // into an invalid 2M PTE.
5929         block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar);
5930 
5931         gpu_state->pte_is_2m = true;
5932         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5933     }
5934 }
5935 
5936 // Combination split + unmap operation, called when only part of a valid 2M PTE
5937 // mapping is being unmapped. The 2M PTE is split into a mix of valid and
5938 // invalid big and/or 4k PTEs, as described by
5939 // block_context->mapping.new_pte_state.
5940 //
5941 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
5942 //
5943 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of
5944 // the current 2M PTE.
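//
// Illustrative sketch (hypothetical request): if the block is fully mapped as
// a valid 2M PTE and an unmap arrives for just its first 64K big page region,
// that region is "covered" by the operation and typically ends up as an
// unmapped big PTE (case 2 in the function body), or as 4k PTEs if only part
// of it is unmapped (case 3), while the untouched regions inherit the 2M
// permissions as valid big PTEs (case 1).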
5945 static void block_gpu_unmap_split_2m(uvm_va_block_t *block,
5946                                      uvm_va_block_context_t *block_context,
5947                                      uvm_gpu_t *gpu,
5948                                      const uvm_page_mask_t *pages_to_unmap,
5949                                      uvm_push_t *push,
5950                                      uvm_membar_t tlb_membar)
5951 {
5952     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5953     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
5954     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
5955     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
5956     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
5957     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
5958     uvm_processor_id_t resident_id;
5959     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5960     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5961     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5962 
5963     UVM_ASSERT(gpu_state->pte_is_2m);
5964 
5965     resident_id = block_gpu_get_processor_to_map(block, gpu, 0);
5966 
5967     uvm_pte_batch_begin(push, pte_batch);
5968 
5969     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
5970     // from the lower levels. This means we don't need to issue a TLB invalidate
5971     // when writing those levels.
5972 
5973     // Cases to handle:
5974     // 1) Big PTEs which inherit curr_prot
5975     // 2) Big PTEs which get unmapped
5976     // 3) Big PTEs which are split to 4k
5977     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
5978     //    b) 4k PTEs which get unmapped under the split big PTEs
5979 
5980     // Compute the big PTEs which will need to be split to 4k, if any.
5981     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5982 
5983     if (gpu_state->page_table_range_big.table) {
5984         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
5985         // any. These are the big PTEs which are unchanged (uncovered) by the
5986         // operation.
5987         bitmap_andnot(big_ptes_inherit,
5988                       new_pte_state->big_ptes,
5989                       new_pte_state->big_ptes_covered,
5990                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5991 
5992         block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
5993 
5994         // Case 2: Clear the new big PTEs which get unmapped (those not covering
5995         // 4ks)
5996         bitmap_and(big_ptes_new_prot,
5997                    new_pte_state->big_ptes,
5998                    new_pte_state->big_ptes_covered,
5999                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6000 
6001         block_gpu_pte_clear_big(block,
6002                                 gpu,
6003                                 big_ptes_new_prot,
6004                                 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
6005                                 pte_batch,
6006                                 NULL);
6007 
6008         // Case 3: Clear (to invalid) the big PTEs which cover 4k PTEs
6009         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6010 
6011         // We just wrote all possible big PTEs, so mark them as initialized
6012         gpu_state->initialized_big = true;
6013     }
6014     else {
6015         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6016         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6017     }
6018 
6019     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
6020     block_gpu_pte_big_split_write_4k(block,
6021                                      block_context,
6022                                      gpu,
6023                                      resident_id,
6024                                      UVM_PROT_NONE,
6025                                      big_ptes_split,
6026                                      pages_to_unmap,
6027                                      pte_batch);
6028 
6029     // And activate the 2M PDE. This ends the pte_batch and issues a single TLB
6030     // invalidate for the 2M entry.
6031     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6032 
6033     gpu_state->pte_is_2m = false;
6034     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6035 }
6036 
6037 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
6038 // and 4k PTEs.
6039 //
6040 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
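//
// Unlike block_gpu_unmap_split_2m, this path starts with the 2M entry already
// acting as a PDE (gpu_state->pte_is_2m is false), so the transitions are
// described entirely by the big/4k case list at the top of the function body.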
6041 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
6042                                        uvm_va_block_context_t *block_context,
6043                                        uvm_gpu_t *gpu,
6044                                        const uvm_page_mask_t *pages_to_unmap,
6045                                        uvm_push_t *push,
6046                                        uvm_membar_t tlb_membar)
6047 {
6048     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6049     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6050     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6051     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6052     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6053     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6054     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6055     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6056     NvU32 big_page_size = tree->big_page_size;
6057     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6058 
6059     UVM_ASSERT(!gpu_state->pte_is_2m);
6060 
6061     uvm_pte_batch_begin(push, pte_batch);
6062     uvm_tlb_batch_begin(tree, tlb_batch);
6063 
6064     // All of these cases might be performed in the same call:
6065     // 1) Split currently-big PTEs to 4k
6066     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
6067     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
6068     // 2) Merge currently-4k PTEs to unmapped big
6069     // 3) Clear currently-big PTEs which wholly get unmapped
6070     // 4) Clear currently-4k PTEs which get unmapped
6071     // 5) Initialize big PTEs which are not covered by this operation
6072 
6073     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
6074     // being split. We'll make the big PTEs inactive below after flushing these
6075     // writes. No TLB invalidate is needed since the big PTE is active.
6076     //
6077     // Mask computation: big_before && !big_after
6078     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6079 
6080     block_gpu_pte_big_split_write_4k(block,
6081                                      block_context,
6082                                      gpu,
6083                                      UVM_ID_INVALID,
6084                                      UVM_PROT_NONE,
6085                                      big_ptes_split,
6086                                      pages_to_unmap,
6087                                      pte_batch);
6088 
6089     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
6090     // remain uncovered after the unmap.
6091     //
6092     // Mask computation: !big_before && !big_after
6093     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6094     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
6095     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
6096         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
6097 
6098     // Case 5: If the big page table is newly-allocated, make sure that all big
6099     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
6100     // initialized to invalid.
6101     //
6102     // The similar case of making newly-allocated big PTEs unmapped when no
6103     // lower 4k table is present is handled by having
6104     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
6105     // appropriately.
6106     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
6107         // TODO: Bug 1766424: If we have the 4k page table already, we could
6108         //       attempt to merge all uncovered big PTE regions when first
6109         //       allocating the big table. That's probably not worth doing.
6110         UVM_ASSERT(gpu_state->page_table_range_4k.table);
6111         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6112         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
6113         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
6114         gpu_state->initialized_big = true;
6115     }
6116 
6117     // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are
6118     // getting wholly unmapped, and those currently-big PTEs which are being
6119     // split to 4k. We can't directly transition from a valid big PTE to valid
6120     // lower PTEs, because that could cause the GPU TLBs to cache the same VA in
6121     // different cache lines, which could break memory ordering.
6123     //
6124     // Mask computation: (big_before && big_after && covered) ||
6125     //                   (big_before && !big_after)
6126     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6127     bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6128     bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6129     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch);
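
    // Illustrative sketch (hypothetical values): with big PTE slots {0,1,2}
    // currently big, slots {0,1} still big afterwards, and slot 0 the only one
    // covered by this unmap, the bitmap operations above give
    //
    //     big_before && big_after           = 0b011
    //     ... && covered                    = 0b001   (slot 0, wholly unmapped)
    //     ... || big_ptes_split (0b100)     = 0b101
    //
    // so slots 0 and 2 are written as unmapped big PTEs here, and slot 2 is
    // switched to invalid further below to finish its split to 4k.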
6130 
6131     // Case 2: Merge the new big PTEs and end the batches, now that we've done
6132     // all of the independent PTE writes we can.
6133     //
6134     // Mask computation: !big_before && big_after
6135     if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
6136         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
6137         // batches.
6138         block_gpu_pte_merge_big_and_end(block,
6139                                         block_context,
6140                                         gpu,
6141                                         big_ptes_mask,
6142                                         push,
6143                                         pte_batch,
6144                                         tlb_batch,
6145                                         tlb_membar);
6146     }
6147     else {
6148         // End the batches. We have to commit the membars and TLB invalidates
6149         // before we finish splitting formerly-big PTEs.
6150         uvm_pte_batch_end(pte_batch);
6151         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6152     }
6153 
6154     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
6155         block_gpu_needs_to_activate_table(block, gpu)) {
6156         uvm_pte_batch_begin(push, pte_batch);
6157         uvm_tlb_batch_begin(tree, tlb_batch);
6158 
6159         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
6160         // switching them from unmapped to invalid. This causes the GPU MMU to
6161         // start reading the 4k PTEs instead of stopping at the unmapped big
6162         // PTEs.
6163         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
6164 
6165         uvm_pte_batch_end(pte_batch);
6166 
6167         // Finally, activate the page tables if they're inactive
6168         if (block_gpu_needs_to_activate_table(block, gpu))
6169             block_gpu_write_pde(block, gpu, push, tlb_batch);
6170 
6171         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
6172     }
6173 
6174     // Update gpu_state
6175     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6176 }
6177 
6178 // When PTE state is about to change (for example due to a map/unmap/revoke
6179 // operation), this function decides how to split and merge the PTEs in response
6180 // to that operation.
6181 //
6182 // The operation is described with the two page masks:
6183 //
6184 // - pages_changing indicates which pages will have their PTE mappings changed
6185 //   on the GPU in some way as a result of the operation (for example, which
6186 //   pages will actually have their mapping permissions upgraded).
6187 //
6188 // - page_mask_after indicates which pages on this GPU will have exactly the
6189 //   same PTE attributes (permissions, residency) as pages_changing after the
6190 //   operation is applied.
6191 //
6192 // PTEs are merged eagerly.
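//
// Illustrative sketch (hypothetical operation): if every page in the block
// will have matching attributes (page_mask_after is full) and the resident
// memory is physically contiguous, a GPU which supports 2M PTEs simply gets
// pte_is_2m = true. If instead only a single big page region is changing and
// the whole region has matching attributes afterwards, its bit is set in both
// big_ptes and big_ptes_covered, needs_4k can remain false, and the bits for
// regions untouched by the operation are filled in from the existing PTE
// state.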
6193 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
6194                                             uvm_gpu_t *gpu,
6195                                             uvm_processor_id_t resident_id,
6196                                             const uvm_page_mask_t *pages_changing,
6197                                             const uvm_page_mask_t *page_mask_after,
6198                                             uvm_va_block_new_pte_state_t *new_pte_state)
6199 {
6200     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6201     uvm_va_block_region_t big_region_all, big_page_region, region;
6202     NvU32 big_page_size;
6203     uvm_page_index_t page_index;
6204     size_t big_page_index;
6205     DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6206     bool can_make_new_big_ptes;
6207 
6208     memset(new_pte_state, 0, sizeof(*new_pte_state));
6209     new_pte_state->needs_4k = true;
6210 
6211     // TODO: Bug 1676485: Force a specific page size for perf testing
6212 
6213     if (gpu_state->force_4k_ptes)
6214         return;
6215 
6216     // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(),
6217     // hmm_range_fault(), and make_device_exclusive_range() don't handle folios
6218     // yet. Also, it makes mremap() difficult since the new address may not
6219     // align with the GPU block size otherwise.
6220     // If PAGE_SIZE is 64K, the code following this check is OK since 64K big
6221     // pages are supported on all HMM-supported GPUs (Turing+).
6222     // TODO: Bug 3368756: add support for transparent huge pages (THP).
6223     if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K)
6224         return;
6225 
6226     UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after));
6227 
6228     // If all pages in the 2M mask have the same attributes after the
6229     // operation is applied, we can use a 2M PTE.
6230     if (block_gpu_supports_2m(block, gpu) &&
6231         uvm_page_mask_full(page_mask_after) &&
6232         (UVM_ID_IS_INVALID(resident_id) || is_block_phys_contig(block, resident_id))) {
6233         new_pte_state->pte_is_2m = true;
6234         new_pte_state->needs_4k = false;
6235         return;
6236     }
6237 
6238     // Find big PTEs with matching attributes
6239 
6240     // Can this block fit any big pages?
6241     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
6242     big_region_all = uvm_va_block_big_page_region_all(block, big_page_size);
6243     if (big_region_all.first >= big_region_all.outer)
6244         return;
6245 
6246     new_pte_state->needs_4k = false;
6247 
6248     can_make_new_big_ptes = true;
6249 
6250     // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+).
6251     if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages)
6252         can_make_new_big_ptes = false;
6253 
6254     // We must not fail during teardown, which performs an unmap
6255     // (resident_id == UVM_ID_INVALID) requiring no splits. That means we should
6256     // avoid allocating PTEs which are only needed for merges.
6257     //
6258     // This only matters if we're merging to big PTEs. If we're merging to 2M,
6259     // then we must already have the 2M level (since it has to be allocated
6260     // before the lower levels).
6261     //
6262     // If pte_is_2m already and we don't have a big table, we're splitting so we
6263     // have to allocate.
6264     if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m)
6265         can_make_new_big_ptes = false;
6266 
6267     for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
6268         uvm_va_block_region_t contig_region = {0};
6269 
6270         big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
6271         big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6272 
6273         if (!UVM_ID_IS_INVALID(resident_id))
6274             contig_region = block_phys_contig_region(block, page_index, resident_id);
6275 
6276         __set_bit(big_page_index, new_pte_state->big_ptes_covered);
6277 
6278         // When mapping sysmem, we can use big pages only if we are mapping all
6279         // pages in the big page subregion and the CPU pages backing the
6280         // subregion are physically contiguous.
6281         if (can_make_new_big_ptes &&
6282             uvm_page_mask_region_full(page_mask_after, big_page_region) &&
6283             (!UVM_ID_IS_CPU(resident_id) ||
6284              (contig_region.first <= big_page_region.first && contig_region.outer >= big_page_region.outer))) {
6285             __set_bit(big_page_index, new_pte_state->big_ptes);
6286         }
6287 
6288         if (!test_bit(big_page_index, new_pte_state->big_ptes))
6289             new_pte_state->needs_4k = true;
6290 
6291         // Skip to the end of the region
6292         page_index = big_page_region.outer - 1;
6293     }
6294 
6295     if (!new_pte_state->needs_4k) {
6296         // All big page regions in pages_changing will be big PTEs. Now check if
6297         // there are any unaligned pages outside of big_region_all which are
6298         // changing.
6299         region = uvm_va_block_region(0, big_region_all.first);
6300         if (!uvm_page_mask_region_empty(pages_changing, region)) {
6301             new_pte_state->needs_4k = true;
6302         }
6303         else {
6304             region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block));
6305             if (!uvm_page_mask_region_empty(pages_changing, region))
6306                 new_pte_state->needs_4k = true;
6307         }
6308     }
6309 
6310     // Now add in the PTEs which should be big but weren't covered by this
6311     // operation.
6312     //
6313     // Note that we can't assume that a given page table range has been
6314     // initialized if it's present here, since it could have been allocated by a
6315     // thread which had to restart its operation due to allocation retry.
6316     if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) {
6317         // We're splitting a 2M PTE so all of the uncovered big PTE regions will
6318         // become big PTEs which inherit the 2M permissions. If we haven't
6319         // allocated the 2M table yet, it will start as a 2M PTE until the lower
6320         // levels are allocated, so it's the same split case regardless of
6321         // whether this operation will need to retry a later allocation.
6322         bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6323     }
6324     else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) {
6325         // If we don't have 4k PTEs and we won't be allocating them for this
6326         // operation, all of our PTEs need to be big.
6327         UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6328         bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6329         bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size));
6330     }
6331     else {
6332         // Otherwise, add in all of the currently-big PTEs which are unchanging.
6333         // They won't be written, but they need to be carried into the new
6334         // gpu_state->big_ptes when it's updated.
6335         bitmap_andnot(big_ptes_not_covered,
6336                       gpu_state->big_ptes,
6337                       new_pte_state->big_ptes_covered,
6338                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6339     }
6340 
6341     bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6342 }
6343 
6344 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that
6345 // handles allocation retry. If the block lock has been unlocked and relocked as
6346 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal
6347 // to the caller that the operation likely needs to be restarted. If that
6348 // happens, the pending tracker is added to the block's tracker.
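//
// A sketch of how a caller might react to the return value (the variable
// names and the retry label here are illustrative, not part of the driver):
//
//     status = block_alloc_pt_range_with_retry(block, gpu, page_size,
//                                              &range, tracker);
//     if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
//         goto retry;   // the block lock was dropped; re-evaluate block state
//     else if (status != NV_OK)
//         return status;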
6349 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
6350                                                  uvm_gpu_t *gpu,
6351                                                  NvU32 page_size,
6352                                                  uvm_page_table_range_t *page_table_range,
6353                                                  uvm_tracker_t *pending_tracker)
6354 {
6355     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6356     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6357     uvm_page_tree_t *page_tables = &gpu_va_space->page_tables;
6358     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
6359     uvm_page_table_range_t local_range;
6360     NV_STATUS status;
6361 
6362     // Blocks may contain large PTEs without starting on a PTE boundary or
6363     // having an aligned size. Cover the PTEs of this size in the block's
6364     // interior so we match uvm_va_block_gpu_state_t::big_ptes.
6365     NvU64 start = UVM_ALIGN_UP(va_block->start, page_size);
6366     NvU64 size  = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start;
6367 
6368     // VA blocks which can use the 2MB level as either a PTE or a PDE need to
6369     // account for the PDE specially, so they must use uvm_page_tree_alloc_table
6370     // to allocate the lower levels.
6371     bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M;
6372 
6373     UVM_ASSERT(page_table_range->table == NULL);
6374 
6375     if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) {
6376         --va_block_test->page_table_allocation_retry_force_count;
6377         status = NV_ERR_NO_MEMORY;
6378     }
6379     else if (use_alloc_table) {
6380         // Pascal+: 4k/64k tables under a 2M entry
6381         UVM_ASSERT(gpu_state->page_table_range_2m.table);
6382         status = uvm_page_tree_alloc_table(page_tables,
6383                                            page_size,
6384                                            UVM_PMM_ALLOC_FLAGS_NONE,
6385                                            &gpu_state->page_table_range_2m,
6386                                            page_table_range);
6387     }
6388     else {
6389         // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+
6390         status = uvm_page_tree_get_ptes(page_tables,
6391                                         page_size,
6392                                         start,
6393                                         size,
6394                                         UVM_PMM_ALLOC_FLAGS_NONE,
6395                                         page_table_range);
6396     }
6397 
6398     if (status == NV_OK)
6399         goto allocated;
6400 
6401     if (status != NV_ERR_NO_MEMORY)
6402         return status;
6403 
6404     // Before unlocking the block lock, any pending work on the block has to be
6405     // added to the block's tracker.
6406     if (pending_tracker) {
6407         status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker);
6408         if (status != NV_OK)
6409             return status;
6410     }
6411 
6412     // Unlock the va block and retry with eviction enabled
6413     uvm_mutex_unlock(&va_block->lock);
6414 
6415     if (use_alloc_table) {
6416         // Although we don't hold the block lock here, it's safe to pass
6417         // gpu_state->page_table_range_2m to the page tree code because we know
6418         // that the 2m range has already been allocated, and that it can't go
6419         // away while we have the va_space lock held.
6420         status = uvm_page_tree_alloc_table(page_tables,
6421                                            page_size,
6422                                            UVM_PMM_ALLOC_FLAGS_EVICT,
6423                                            &gpu_state->page_table_range_2m,
6424                                            &local_range);
6425     }
6426     else {
6427         status = uvm_page_tree_get_ptes(page_tables,
6428                                         page_size,
6429                                         start,
6430                                         size,
6431                                         UVM_PMM_ALLOC_FLAGS_EVICT,
6432                                         &local_range);
6433     }
6434 
6435     uvm_mutex_lock(&va_block->lock);
6436 
6437     if (status != NV_OK)
6438         return status;
6439 
6440     status = NV_ERR_MORE_PROCESSING_REQUIRED;
6441 
6442     if (page_table_range->table) {
6443         // A different caller allocated the page tables in the meantime, so release
6444         // the local copy.
6445         uvm_page_tree_put_ptes(page_tables, &local_range);
6446         return status;
6447     }
6448 
6449     *page_table_range = local_range;
6450 
6451 allocated:
6452     // Mark the 2M PTE as active when we first allocate it, since we don't have
6453     // any PTEs below it yet.
6454     if (page_size == UVM_PAGE_SIZE_2M) {
6455         UVM_ASSERT(!gpu_state->pte_is_2m);
6456         gpu_state->pte_is_2m = true;
6457     }
6458     else if (page_size != UVM_PAGE_SIZE_4K) {
6459         // uvm_page_tree_get_ptes initializes big PTEs to invalid.
6460         // uvm_page_tree_alloc_table does not, so we'll have to do it later.
6461         if (use_alloc_table)
6462             UVM_ASSERT(!gpu_state->initialized_big);
6463         else
6464             gpu_state->initialized_big = true;
6465     }
6466 
6467     return status;
6468 }
6469 
6470 // Helper which allocates all page table ranges necessary for the given page
6471 // sizes. See block_alloc_pt_range_with_retry.
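//
// page_sizes is a bitmask of UVM_PAGE_SIZE_* values. As a hypothetical
// example, passing UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K on a GPU where
// block_gpu_supports_2m() returns true also allocates the 2M level, and
// allocates it first, since the lower levels hang off the 2M range.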
6472 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
6473                                              uvm_gpu_t *gpu,
6474                                              NvU32 page_sizes,
6475                                              uvm_tracker_t *pending_tracker)
6476 {
6477     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
6478     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
6479     uvm_page_table_range_t *range;
6480     NvU32 page_size;
6481     NV_STATUS status, final_status = NV_OK;
6482 
6483     UVM_ASSERT(gpu_state);
6484 
6485     // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first
6486     // in order to allocate the levels below.
6487     if (block_gpu_supports_2m(va_block, gpu))
6488         page_sizes |= UVM_PAGE_SIZE_2M;
6489 
6490     UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes);
6491 
6492     for_each_chunk_size_rev(page_size, page_sizes) {
6493         if (page_size == UVM_PAGE_SIZE_2M)
6494             range = &gpu_state->page_table_range_2m;
6495         else if (page_size == UVM_PAGE_SIZE_4K)
6496             range = &gpu_state->page_table_range_4k;
6497         else
6498             range = &gpu_state->page_table_range_big;
6499 
6500         if (range->table)
6501             continue;
6502 
6503         if (page_size == UVM_PAGE_SIZE_2M) {
6504             UVM_ASSERT(!gpu_state->pte_is_2m);
6505             UVM_ASSERT(!gpu_state->page_table_range_big.table);
6506             UVM_ASSERT(!gpu_state->page_table_range_4k.table);
6507         }
6508         else if (page_size != UVM_PAGE_SIZE_4K) {
6509             UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0);
6510             UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6511         }
6512 
6513         status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker);
6514 
6515         // Keep going to allocate the remaining levels even if the allocation
6516         // requires a retry, since we'll likely still need them when we retry
6517         // anyway.
6518         if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
6519             final_status = NV_ERR_MORE_PROCESSING_REQUIRED;
6520         else if (status != NV_OK)
6521             return status;
6522     }
6523 
6524     return final_status;
6525 }
6526 
6527 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
6528                                             uvm_gpu_t *gpu,
6529                                             uvm_va_block_new_pte_state_t *new_pte_state,
6530                                             uvm_tracker_t *pending_tracker)
6531 {
6532     NvU32 page_sizes = 0;
6533 
6534     if (new_pte_state->pte_is_2m) {
6535         page_sizes |= UVM_PAGE_SIZE_2M;
6536     }
6537     else {
6538         if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6539             page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu);
6540 
6541         if (new_pte_state->needs_4k)
6542             page_sizes |= UVM_PAGE_SIZE_4K;
6543         else
6544             UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6545     }
6546 
6547     return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker);
6548 }
6549 
6550 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block.
6551 // This is currently used on ATS systems to prevent GPUs from inadvertently
6552 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables,
6553 // which is where the NOATS bit resides.
6554 //
6555 // The current implementation simply pre-allocates the PTEs for the VA Block,
6556 // which is wasteful because the GPU may never need them.
6557 //
6558 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1
6559 // page table entries without having to request PTEs.
6560 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
6561                                              uvm_gpu_va_space_t *gpu_va_space,
6562                                              uvm_tracker_t *pending_tracker)
6563 {
6564     NvU32 page_sizes;
6565     NvU32 big_page_size;
6566     uvm_gpu_t *gpu;
6567     uvm_va_block_gpu_state_t *gpu_state;
6568 
6569     UVM_ASSERT(block);
6570     UVM_ASSERT(gpu_va_space);
6571     UVM_ASSERT(gpu_va_space->ats.enabled);
6572     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
6573 
6574     gpu = gpu_va_space->gpu;
6575     big_page_size = gpu_va_space->page_tables.big_page_size;
6576 
6577     gpu_state = block_gpu_state_get_alloc(block, gpu);
6578     if (!gpu_state)
6579         return NV_ERR_NO_MEMORY;
6580 
6581     // If the VA Block supports 2M pages, allocate the 2M PTE only, as it
6582     // requires less memory
6583     if (block_gpu_supports_2m(block, gpu))
6584         page_sizes = UVM_PAGE_SIZE_2M;
6585     else if (uvm_va_block_num_big_pages(block, big_page_size) > 0)
6586         page_sizes = big_page_size;
6587     else
6588         page_sizes = UVM_PAGE_SIZE_4K;
6589 
6590     return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker);
6591 }
6592 
6593 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker)
6594 {
6595     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6596     NV_STATUS status = NV_OK;
6597 
6598     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
6599     // comments in block_pre_populate_pde1_gpu.
6600     if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) {
6601         uvm_gpu_va_space_t *gpu_va_space;
6602 
6603         for_each_gpu_va_space(gpu_va_space, va_space) {
6604             // We only care about systems where ATS is supported and the application
6605             // enabled it.
6606             if (!gpu_va_space->ats.enabled)
6607                 continue;
6608 
6609             status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker);
6610             if (status != NV_OK)
6611                 break;
6612         }
6613     }
6614 
6615     return status;
6616 }
6617 
6618 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
6619                                  uvm_va_block_context_t *block_context,
6620                                  uvm_gpu_t *gpu,
6621                                  const uvm_page_mask_t *unmap_page_mask,
6622                                  uvm_tracker_t *out_tracker)
6623 {
6624     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6625     uvm_pte_bits_gpu_t pte_bit;
6626     uvm_push_t push;
6627     uvm_membar_t tlb_membar;
6628     bool only_local_mappings;
6629     uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask;
6630     NV_STATUS status;
6631     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6632     bool mask_empty;
6633 
6634     // We have to check gpu_state before looking at any VA space state like our
6635     // gpu_va_space, because we could be on the eviction path where we don't
6636     // have a lock on that state. However, since remove_gpu_va_space walks each
6637     // block to unmap the GPU before destroying the gpu_va_space, we're
6638     // guaranteed that if this GPU has page tables, the gpu_va_space can't go
6639     // away while we're holding the block lock.
6640     if (!block_gpu_has_page_tables(block, gpu))
6641         return NV_OK;
6642 
6643     if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]))
6644         return NV_OK;
6645 
6646     // block_gpu_compute_new_pte_state needs a mask of pages which will have
6647     // matching attributes after the operation is performed. In the case of
6648     // unmap, those are the pages with unset bits.
6649     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap);
6650     uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask);
6651     block_gpu_compute_new_pte_state(block,
6652                                     gpu,
6653                                     UVM_ID_INVALID,
6654                                     pages_to_unmap,
6655                                     &block_context->scratch_page_mask,
6656                                     new_pte_state);
6657 
6658     status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker);
6659     if (status != NV_OK)
6660         return status;
6661 
6662     only_local_mappings = !block_has_remote_mapping_gpu(block, block_context, gpu->id, pages_to_unmap);
6663     tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings);
6664 
6665     status = uvm_push_begin_acquire(gpu->channel_manager,
6666                                     UVM_CHANNEL_TYPE_MEMOPS,
6667                                     &block->tracker,
6668                                     &push,
6669                                     "Unmapping pages in block [0x%llx, 0x%llx)",
6670                                     block->start,
6671                                     block->end + 1);
6672     if (status != NV_OK)
6673         return status;
6674 
6675     if (new_pte_state->pte_is_2m) {
6676         // We're either unmapping a whole valid 2M PTE, or we're unmapping all
6677         // remaining pages in a split 2M PTE.
6678         block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar);
6679     }
6680     else if (gpu_state->pte_is_2m) {
6681         // The block is currently mapped as a valid 2M PTE and we're unmapping
6682         // some pages within the 2M, so we have to split it into the appropriate
6683         // mix of big and 4k PTEs.
6684         block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6685     }
6686     else {
6687         // We're unmapping some pre-existing mix of big and 4K PTEs into some
6688         // other mix of big and 4K PTEs.
6689         block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
6690     }
6691 
6692     uvm_push_end(&push);
6693 
6694     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
6695         uvm_processor_mask_t non_uvm_lite_gpus;
6696         uvm_processor_mask_andnot(&non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block));
6697 
6698         UVM_ASSERT(uvm_processor_mask_test(&non_uvm_lite_gpus, gpu->id));
6699 
6700         // If the GPU is the only non-UVM-Lite processor with mappings, we can
6701         // safely mark pages as fully unmapped
6702         if (uvm_processor_mask_get_count(&non_uvm_lite_gpus) == 1)
6703             uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
6704     }
6705 
6706     // Clear block PTE state
6707     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
6708         mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit],
6709                                            &gpu_state->pte_bits[pte_bit],
6710                                            pages_to_unmap);
6711         if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty)
6712             uvm_processor_mask_clear(&block->mapped, gpu->id);
6713     }
6714 
6715     UVM_ASSERT(block_check_mappings(block));
6716 
6717     return uvm_tracker_add_push_safe(out_tracker, &push);
6718 }
6719 
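// Unmaps the given processor from the requested pages of region. For the CPU
// this calls block_unmap_cpu directly; for a GPU the PTE updates are pushed to
// the GPU and the push is added to out_tracker.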
6720 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
6721                              uvm_va_block_context_t *va_block_context,
6722                              uvm_processor_id_t id,
6723                              uvm_va_block_region_t region,
6724                              const uvm_page_mask_t *unmap_page_mask,
6725                              uvm_tracker_t *out_tracker)
6726 {
6727     uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask;
6728 
6729     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
6730     uvm_assert_mutex_locked(&va_block->lock);
6731 
6732     if (UVM_ID_IS_CPU(id)) {
6733        block_unmap_cpu(va_block, region, unmap_page_mask);
6734        return NV_OK;
6735     }
6736 
6737     uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask);
6738 
6739     return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker);
6740 }
6741 
6742 // This function essentially works as a wrapper around vm_insert_page (hence
6743 // the similar function prototype). This is needed since vm_insert_page
6744 // doesn't take permissions as input, but uses vma->vm_page_prot instead.
6745 // Since we may have multiple VA blocks under one VMA which need to map
6746 // with different permissions, we have to manually change vma->vm_page_prot for
6747 // each call to vm_insert_page. Multiple faults under one VMA in separate
6748 // blocks can be serviced concurrently, so the VMA wrapper lock is used
6749 // to protect access to vma->vm_page_prot.
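//
// Illustrative usage (see block_map_cpu_page_to below): after any required
// unmap and tracker wait, the caller does roughly
//
//     page = block_page_get(block, block_phys_page(resident_id, page_index));
//     status = uvm_cpu_insert_page(vma, addr, page, new_prot);
//
// where vma comes from the managed VA range and addr is the CPU address of
// the page being mapped.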
6750 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma,
6751                                      NvU64 addr,
6752                                      struct page *page,
6753                                      uvm_prot_t new_prot)
6754 {
6755     uvm_vma_wrapper_t *vma_wrapper;
6756     unsigned long target_flags;
6757     pgprot_t target_pgprot;
6758     int ret;
6759 
6760     UVM_ASSERT(vma);
6761     UVM_ASSERT(vma->vm_private_data);
6762 
6763     vma_wrapper = vma->vm_private_data;
6764     target_flags = vma->vm_flags;
6765 
6766     if (new_prot == UVM_PROT_READ_ONLY)
6767         target_flags &= ~VM_WRITE;
6768 
6769     target_pgprot = vm_get_page_prot(target_flags);
6770 
6771     // Take VMA wrapper lock to check vma->vm_page_prot
6772     uvm_down_read(&vma_wrapper->lock);
6773 
6774     // Take a write lock if we need to modify the VMA's vm_page_prot:
6775     // - vma->vm_page_prot creates writable PTEs but new_prot is RO
6776     // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
6777     if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) {
6778         uvm_up_read(&vma_wrapper->lock);
6779         uvm_down_write(&vma_wrapper->lock);
6780 
6781         vma->vm_page_prot = target_pgprot;
6782 
6783         uvm_downgrade_write(&vma_wrapper->lock);
6784     }
6785 
6786     ret = vm_insert_page(vma, addr, page);
6787     uvm_up_read(&vma_wrapper->lock);
6788     if (ret) {
6789         UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
6790         return errno_to_nv_status(ret);
6791     }
6792 
6793     return NV_OK;
6794 }
6795 
6796 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block,
6797                                        uvm_va_block_context_t *va_block_context,
6798                                        uvm_page_index_t page_index)
6799 {
6800     struct vm_area_struct *vma;
6801     uvm_prot_t logical_prot;
6802 
6803     if (uvm_va_block_is_hmm(va_block)) {
6804         NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index);
6805 
6806         logical_prot = uvm_hmm_compute_logical_prot(va_block, va_block_context, addr);
6807     }
6808     else {
6809         uvm_va_range_t *va_range = va_block->va_range;
6810 
6811         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6812 
6813         // Zombified VA ranges no longer have a vma, so they have no permissions
6814         if (uvm_va_range_is_managed_zombie(va_range)) {
6815             logical_prot = UVM_PROT_NONE;
6816         }
6817         else {
6818             vma = uvm_va_range_vma(va_range);
6819 
6820             if (!(vma->vm_flags & VM_READ))
6821                 logical_prot = UVM_PROT_NONE;
6822             else if (!(vma->vm_flags & VM_WRITE))
6823                 logical_prot = UVM_PROT_READ_ONLY;
6824             else
6825                 logical_prot = UVM_PROT_READ_WRITE_ATOMIC;
6826         }
6827     }
6828 
6829     return logical_prot;
6830 }
6831 
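// Returns the struct page backing the given block page: the CPU chunk's page
// for CPU-resident pages, or the corresponding page of the GPU chunk otherwise
// (in which case the GPU must be NUMA-enabled, as asserted below).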
6832 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page)
6833 {
6834     struct page *page;
6835 
6836     if (UVM_ID_IS_CPU(block_page.processor)) {
6837         page = uvm_cpu_chunk_get_cpu_page(block, block_page.page_index);
6838     }
6839     else {
6840         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6841         uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor);
6842         size_t chunk_offset;
6843         uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
6844 
6845         UVM_ASSERT(gpu->mem_info.numa.enabled);
6846         page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE;
6847     }
6848 
6849     UVM_ASSERT(page);
6850     return page;
6851 }
6852 
6853 // Creates or upgrades a CPU mapping for the given page, updating the block's
6854 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page
6855 // will be mapped with at least new_prot permissions.
6856 //
6857 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use
6858 // block_unmap_cpu or uvm_va_block_revoke_prot instead.
6859 //
6860 // If the existing mapping is >= new_prot already, this is a no-op.
6861 //
6862 // It is the caller's responsibility to:
6863 //  - Revoke mappings from other processors as appropriate so the CPU can map
6864 //    with new_prot permissions
6865 //  - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference
6866 //    and mmap_lock is held in at least read mode)
6867 //  - Ensure that the struct page corresponding to the physical memory being
6868 //    mapped exists
6869 //  - Manage the block's residency bitmap
6870 //  - Ensure that the block hasn't been killed (block->va_range is present)
6871 //  - Update the pte/mapping tracking state on success
6872 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
6873                                        uvm_va_block_context_t *va_block_context,
6874                                        uvm_processor_id_t resident_id,
6875                                        uvm_page_index_t page_index,
6876                                        uvm_prot_t new_prot)
6877 {
6878     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
6879     uvm_va_range_t *va_range = block->va_range;
6880     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6881     struct vm_area_struct *vma;
6882     NV_STATUS status;
6883     NvU64 addr;
6884     struct page *page;
6885 
6886     UVM_ASSERT(uvm_va_block_is_hmm(block) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
6887     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6888     UVM_ASSERT(new_prot < UVM_PROT_MAX);
6889     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6890 
6891     uvm_assert_mutex_locked(&block->lock);
6892     if (UVM_ID_IS_CPU(resident_id))
6893         UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index));
6894 
6895     // For the CPU, write implies atomic
6896     if (new_prot == UVM_PROT_READ_WRITE)
6897         new_prot = UVM_PROT_READ_WRITE_ATOMIC;
6898 
6899     // Only upgrades are supported in this function
6900     UVM_ASSERT(curr_prot <= new_prot);
6901 
6902     if (new_prot == curr_prot)
6903         return NV_OK;
6904 
6905     // Check for existing VMA permissions. They could have been modified after
6906     // the initial mmap by mprotect.
6907     if (new_prot > compute_logical_prot(block, va_block_context, page_index))
6908         return NV_ERR_INVALID_ACCESS_TYPE;
6909 
6910     if (uvm_va_block_is_hmm(block)) {
6911         // Do not map CPU pages because they belong to the Linux kernel.
6912         return NV_OK;
6913     }
6914 
6915     UVM_ASSERT(va_range);
6916 
6917     if (UVM_ID_IS_CPU(resident_id) && UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
6918         // Add the page's range group range to the range group's migrated list.
6919         uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
6920                                                                   uvm_va_block_cpu_page_address(block, page_index));
6921         if (rgr != NULL) {
6922             uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
6923             if (list_empty(&rgr->range_group_migrated_list_node))
6924                 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
6925             uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
6926         }
6927     }
6928 
6929     // It's possible here that current->mm != vma->vm_mm. That can happen for
6930     // example due to access_process_vm (ptrace) or get_user_pages from another
6931     // driver.
6932     //
6933     // In such cases the caller has taken care of ref counting vma->vm_mm for
6934     // us, so we can safely operate on the vma but we can't use
6935     // uvm_va_range_vma_current.
6936     vma = uvm_va_range_vma(va_range);
6937     uvm_assert_mmap_lock_locked(vma->vm_mm);
6938     UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm);
6939 
6940     // Add the mapping
6941     addr = uvm_va_block_cpu_page_address(block, page_index);
6942 
6943     // This unmap handles upgrades as vm_insert_page returns -EBUSY when
6944     // there's already a mapping present at addr, so we have to unmap first
6945     // anyway when upgrading from RO -> RW.
6946     if (curr_prot != UVM_PROT_NONE)
6947         unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1);
6948 
6949     // Don't map the CPU until prior copies and GPU PTE updates finish,
6950     // otherwise we might not stay coherent.
6951     status = uvm_tracker_wait(&block->tracker);
6952     if (status != NV_OK)
6953         return status;
6954 
6955     page = block_page_get(block, block_phys_page(resident_id, page_index));
6956     return uvm_cpu_insert_page(vma, addr, page, new_prot);
6957 }
6958 
6959 // Maps the CPU to the given pages which are resident on resident_id.
6960 // map_page_mask is an in/out parameter: the pages which are newly mapped are
6961 // removed from the mask before returning.
6962 //
6963 // Caller must ensure that:
6964 // - Pages in map_page_mask are not already set in the corresponding
6965 //   cpu.pte_bits mask for the requested protection.
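//
// Illustrative sketch (hypothetical masks): if map_page_mask requests pages
// {0..7} but only pages {0..3} are resident on resident_id, only pages {0..3}
// are mapped and removed from the mask, leaving {4..7} still set in
// map_page_mask on return.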
6966 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
6967                                   uvm_va_block_context_t *block_context,
6968                                   uvm_processor_id_t resident_id,
6969                                   uvm_va_block_region_t region,
6970                                   uvm_page_mask_t *map_page_mask,
6971                                   uvm_prot_t new_prot,
6972                                   uvm_tracker_t *out_tracker)
6973 {
6974     NV_STATUS status = NV_OK;
6975     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
6976     uvm_page_index_t page_index;
6977     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
6978     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id);
6979     uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
6980     uvm_pte_bits_cpu_t pte_bit;
6981 
6982     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
6983 
6984     // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls
6985     //       within block_map_cpu_page_to by doing them once here is helpful.
6986 
6987     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
6988                                   map_page_mask,
6989                                   &block->cpu.pte_bits[prot_pte_bit]));
6990 
6991     // The pages which will actually change are those in the input page mask
6992     // which are resident on the target.
6993     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
6994         return NV_OK;
6995 
6996     status = block_pre_populate_pde1_all_gpus(block, out_tracker);
6997     if (status != NV_OK)
6998         return status;
6999 
7000     block->cpu.ever_mapped = true;
7001 
7002     for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) {
7003         status = block_map_cpu_page_to(block,
7004                                        block_context,
7005                                        resident_id,
7006                                        page_index,
7007                                        new_prot);
7008         if (status != NV_OK)
7009             break;
7010 
7011         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
7012     }
7013 
7014     // If there was some error, shrink the region so that we only update the
7015     // pte/mapping tracking bits for the pages that succeeded
7016     if (status != NV_OK) {
7017         region = uvm_va_block_region(region.first, page_index);
7018         uvm_page_mask_region_clear_outside(pages_to_map, region);
7019     }
7020 
    // If pages are mapped from a remote residency, notify tools of the remote
    // mapping events. We skip event notification if the cause is Invalid: the
    // revocation path uses that value to signal that it is the caller, so we
    // avoid reporting duplicate events.
7025     if (UVM_ID_IS_GPU(resident_id) &&
7026         va_space->tools.enabled &&
7027         block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) {
7028         uvm_va_block_region_t subregion;
7029         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7030             uvm_tools_record_map_remote(block,
7031                                         NULL,
7032                                         UVM_ID_CPU,
7033                                         resident_id,
7034                                         uvm_va_block_region_start(block, subregion),
7035                                         uvm_va_block_region_size(subregion),
7036                                         block_context->mapping.cause);
7037         }
7038     }
7039 
7040     // Update CPU mapping state
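    // The cpu.pte_bits masks are inclusive: a page mapped with a given
    // protection also has every lower protection bit set, so update all bits
    // up to and including prot_pte_bit.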
7041     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7042         uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
7043 
7044     uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
7045 
7046     UVM_ASSERT(block_check_mappings(block));
7047 
7048     // Remove all pages that were newly-mapped from the input mask
7049     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7050 
7051     return status;
7052 }
7053 
7054 // Maps the GPU to the given pages which are resident on resident_id.
7055 // map_page_mask is an in/out parameter: the pages which are mapped
7056 // to resident_id are removed from the mask before returning.
7057 //
7058 // Caller must ensure that:
7059 // -  Pages in map_page_mask must not be set in the corresponding pte_bits mask
7060 // for the requested protection on the mapping GPU.
7061 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
7062                                   uvm_va_block_context_t *block_context,
7063                                   uvm_gpu_t *gpu,
7064                                   uvm_processor_id_t resident_id,
7065                                   uvm_page_mask_t *map_page_mask,
7066                                   uvm_prot_t new_prot,
7067                                   uvm_tracker_t *out_tracker)
7068 {
7069     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7070     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7071     uvm_push_t push;
7072     NV_STATUS status;
7073     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
7074     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7075     uvm_pte_bits_gpu_t pte_bit;
7076     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7077     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7078     block_pte_op_t pte_op;
7079 
7080     UVM_ASSERT(map_page_mask);
7081     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
7082 
7083     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7084         UVM_ASSERT(uvm_id_equal(resident_id, uvm_va_range_get_policy(va_block->va_range)->preferred_location));
7085 
7086     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
7087                                   map_page_mask,
7088                                   &gpu_state->pte_bits[prot_pte_bit]));
7089 
7090     // The pages which will actually change are those in the input page mask
7091     // which are resident on the target.
7092     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
7093         return NV_OK;
7094 
7095     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_map));
7096 
7097     // For PTE merge/split computation, compute all resident pages which will
7098     // have exactly new_prot after performing the mapping.
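    //       That is: pages which either already have new_prot or are about to
    //       get it, minus pages which already have a higher protection (their
    //       effective protection stays above new_prot), clamped to the pages
    //       resident on resident_id.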
7099     uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map);
7100     if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) {
7101         uvm_page_mask_andnot(&block_context->scratch_page_mask,
7102                              &block_context->scratch_page_mask,
7103                              &gpu_state->pte_bits[prot_pte_bit + 1]);
7104     }
7105     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7106 
7107     block_gpu_compute_new_pte_state(va_block,
7108                                     gpu,
7109                                     resident_id,
7110                                     pages_to_map,
7111                                     &block_context->scratch_page_mask,
7112                                     new_pte_state);
7113 
7114     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7115     if (status != NV_OK)
7116         return status;
7117 
7118     status = uvm_push_begin_acquire(gpu->channel_manager,
7119                                     UVM_CHANNEL_TYPE_MEMOPS,
7120                                     &va_block->tracker,
7121                                     &push,
7122                                     "Mapping pages in block [0x%llx, 0x%llx) as %s",
7123                                     va_block->start,
7124                                     va_block->end + 1,
7125                                     uvm_prot_string(new_prot));
7126     if (status != NV_OK)
7127         return status;
7128 
7129     pte_op = BLOCK_PTE_OP_MAP;
7130     if (new_pte_state->pte_is_2m) {
7131         // We're either modifying permissions of a pre-existing 2M PTE, or all
7132         // permissions match so we can merge to a new 2M PTE.
7133         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7134     }
7135     else if (gpu_state->pte_is_2m) {
7136         // Permissions on a subset of the existing 2M PTE are being upgraded, so
7137         // we have to split it into the appropriate mix of big and 4k PTEs.
7138         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7139     }
7140     else {
7141         // We're upgrading permissions on some pre-existing mix of big and 4K
7142         // PTEs into some other mix of big and 4K PTEs.
7143         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
7144     }
7145 
7146     // If we are mapping remotely, record the event
7147     if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) {
7148         uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block);
7149 
7150         UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid);
7151 
7152         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
7153             uvm_tools_record_map_remote(va_block,
7154                                         &push,
7155                                         gpu->id,
7156                                         resident_id,
7157                                         uvm_va_block_region_start(va_block, subregion),
7158                                         uvm_va_block_region_size(subregion),
7159                                         block_context->mapping.cause);
7160         }
7161     }
7162 
7163     uvm_push_end(&push);
7164 
7165     // Update GPU mapping state
7166     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
7167         uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map);
7168 
7169     uvm_processor_mask_set(&va_block->mapped, gpu->id);
7170 
7171     // If we are mapping a UVM-Lite GPU do not update maybe_mapped_pages
7172     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id))
7173         uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
7174 
    // Remove the newly-mapped pages, i.e. the pages resident on this
    // processor, from the input mask.
7177     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
7178 
7179     UVM_ASSERT(block_check_mappings(va_block));
7180 
7181     return uvm_tracker_add_push_safe(out_tracker, &push);
7182 }
7183 
7184 static void map_get_allowed_destinations(uvm_va_block_t *block,
7185                                          uvm_va_block_context_t *va_block_context,
7186                                          const uvm_va_policy_t *policy,
7187                                          uvm_processor_id_t id,
7188                                          uvm_processor_mask_t *allowed_mask)
7189 {
7190     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7191 
7192     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
7193         // UVM-Lite can only map resident pages on the preferred location
7194         uvm_processor_mask_zero(allowed_mask);
7195         uvm_processor_mask_set(allowed_mask, policy->preferred_location);
7196     }
7197     else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
7198               (uvm_id_equal(policy->preferred_location, id) &&
7199                !is_uvm_fault_force_sysmem_set() &&
7200                !uvm_hmm_must_use_sysmem(block, va_block_context))) &&
7201              uvm_va_space_processor_has_memory(va_space, id)) {
7202         // When operating under read-duplication we should only map the local
7203         // processor to cause fault-and-duplicate of remote pages.
7204         //
7205         // The same holds when this processor is the preferred location: only
7206         // create local mappings to force remote pages to fault-and-migrate.
7207         uvm_processor_mask_zero(allowed_mask);
7208         uvm_processor_mask_set(allowed_mask, id);
7209     }
7210     else {
7211         // Common case: Just map wherever the memory happens to reside
7212         uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]);
7213         return;
7214     }
7215 
7216     // Clamp to resident and accessible processors
7217     uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident);
7218     uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]);
7219 }
7220 
7221 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
7222                            uvm_va_block_context_t *va_block_context,
7223                            uvm_processor_id_t id,
7224                            uvm_va_block_region_t region,
7225                            const uvm_page_mask_t *map_page_mask,
7226                            uvm_prot_t new_prot,
7227                            UvmEventMapRemoteCause cause,
7228                            uvm_tracker_t *out_tracker)
7229 {
7230     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7231     uvm_gpu_t *gpu = NULL;
7232     uvm_processor_mask_t allowed_destinations;
7233     uvm_processor_id_t resident_id;
7234     const uvm_page_mask_t *pte_mask;
7235     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
7236     NV_STATUS status;
7237 
7238     va_block_context->mapping.cause = cause;
7239 
7240     UVM_ASSERT(new_prot != UVM_PROT_NONE);
7241     UVM_ASSERT(new_prot < UVM_PROT_MAX);
7242     uvm_assert_mutex_locked(&va_block->lock);
7243     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
7244 
7245     // Mapping is not supported on the eviction path that doesn't hold the VA
7246     // space lock.
7247     uvm_assert_rwsem_locked(&va_space->lock);
7248 
7249     if (UVM_ID_IS_CPU(id)) {
7250         uvm_pte_bits_cpu_t prot_pte_bit;
7251 
7252         // Check if the current thread is allowed to call vm_insert_page
7253         if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm))
7254             return NV_OK;
7255 
7256         prot_pte_bit = get_cpu_pte_bit_index(new_prot);
7257         pte_mask = &va_block->cpu.pte_bits[prot_pte_bit];
7258     }
7259     else {
7260         uvm_va_block_gpu_state_t *gpu_state;
7261         uvm_pte_bits_gpu_t prot_pte_bit;
7262 
7263         gpu = uvm_va_space_get_gpu(va_space, id);
7264 
7265         // Although this GPU UUID is registered in the VA space, it might not have a
7266         // GPU VA space registered.
7267         if (!uvm_gpu_va_space_get(va_space, gpu))
7268             return NV_OK;
7269 
7270         gpu_state = block_gpu_state_get_alloc(va_block, gpu);
7271         if (!gpu_state)
7272             return NV_ERR_NO_MEMORY;
7273 
7274         prot_pte_bit = get_gpu_pte_bit_index(new_prot);
7275         pte_mask = &gpu_state->pte_bits[prot_pte_bit];
7276     }
7277 
7278     uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask);
7279 
7280     if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask))
7281         return NV_OK;
7282 
7283     // Map per resident location so we can more easily detect physically-
7284     // contiguous mappings.
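    // Iterating residencies from closest to farthest, combined with
    // block_map_*_to removing the pages it maps from running_page_mask, means
    // a page which is resident in multiple places (read duplication) gets
    // mapped to its closest copy.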
7285     map_get_allowed_destinations(va_block, va_block_context, va_block_context->policy, id, &allowed_destinations);
7286 
7287     for_each_closest_id(resident_id, &allowed_destinations, id, va_space) {
7288         if (UVM_ID_IS_CPU(id)) {
7289             status = block_map_cpu_to(va_block,
7290                                       va_block_context,
7291                                       resident_id,
7292                                       region,
7293                                       running_page_mask,
7294                                       new_prot,
7295                                       out_tracker);
7296         }
7297         else {
7298             status = block_map_gpu_to(va_block,
7299                                       va_block_context,
7300                                       gpu,
7301                                       resident_id,
7302                                       running_page_mask,
7303                                       new_prot,
7304                                       out_tracker);
7305         }
7306 
7307         if (status != NV_OK)
7308             return status;
7309 
7310         // If we've mapped all requested pages, we're done
7311         if (uvm_page_mask_region_empty(running_page_mask, region))
7312             break;
7313     }
7314 
7315     return NV_OK;
7316 }
7317 
// Revokes write permission from the given pages mapped by the CPU. This is
// implemented by unmapping the pages and remapping them later with the lower
// permission, which is required because vm_insert_page can only be used for
// upgrades from Invalid.
7321 //
7322 // Caller must ensure that:
7323 // -  Pages in revoke_page_mask must be set in the
7324 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask.
7325 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block,
7326                                         uvm_va_block_context_t *block_context,
7327                                         uvm_va_block_region_t region,
7328                                         const uvm_page_mask_t *revoke_page_mask,
7329                                         uvm_tracker_t *out_tracker)
7330 {
7331     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7332     uvm_va_block_region_t subregion;
7333 
7334     UVM_ASSERT(revoke_page_mask);
7335 
7336     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
7337 
7338     block_unmap_cpu(block, region, revoke_page_mask);
7339 
7340     // Coalesce revocation event notification
7341     for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) {
7342         uvm_perf_event_notify_revocation(&va_space->perf_events,
7343                                          block,
7344                                          UVM_ID_CPU,
7345                                          uvm_va_block_region_start(block, subregion),
7346                                          uvm_va_block_region_size(subregion),
7347                                          UVM_PROT_READ_WRITE_ATOMIC,
7348                                          UVM_PROT_READ_ONLY);
7349     }
7350 
7351     // uvm_va_block_map will skip this remap if we aren't holding the right mm
7352     // lock.
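    // The remap uses UvmEventMapRemoteCauseInvalid so that block_map_cpu_to
    // doesn't report a duplicate remote-map event for this revocation path.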
7353     return uvm_va_block_map(block,
7354                             block_context,
7355                             UVM_ID_CPU,
7356                             region,
7357                             revoke_page_mask,
7358                             UVM_PROT_READ_ONLY,
7359                             UvmEventMapRemoteCauseInvalid,
7360                             out_tracker);
7361 }
7362 
7363 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
7364                                               uvm_va_block_context_t *block_context,
7365                                               uvm_gpu_t *gpu,
7366                                               uvm_prot_t prot_revoked,
7367                                               const uvm_page_mask_t *pages_revoked)
7368 {
7369     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7370     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7371     uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
7372     uvm_pte_bits_gpu_t pte_bit;
7373 
7374     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
7375         uvm_prot_t old_prot;
7376 
7377         if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
7378             continue;
7379 
7380         if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
7381             old_prot = UVM_PROT_READ_WRITE_ATOMIC;
7382         else
7383             old_prot = UVM_PROT_READ_WRITE;
7384 
7385         for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
7386             uvm_perf_event_notify_revocation(&va_space->perf_events,
7387                                              block,
7388                                              gpu->id,
7389                                              uvm_va_block_region_start(block, subregion),
7390                                              uvm_va_block_region_size(subregion),
7391                                              old_prot,
7392                                              prot_revoked - 1);
7393         }
7394     }
7395 }
7396 
// Revokes the given protection from gpu's mappings of the pages which are
// resident on resident_id.
7398 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
7399 // permissions and are mapped to resident_id are removed from the mask before
7400 // returning.
7401 //
7402 // Caller must ensure that:
// -  Pages in revoke_page_mask must be set in the corresponding pte_bits mask
// for the protection to be revoked on the mapping GPU.
7405 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
7406                                           uvm_va_block_context_t *block_context,
7407                                           uvm_gpu_t *gpu,
7408                                           uvm_processor_id_t resident_id,
7409                                           uvm_page_mask_t *revoke_page_mask,
7410                                           uvm_prot_t prot_to_revoke,
7411                                           uvm_tracker_t *out_tracker)
7412 {
7413     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7414     uvm_push_t push;
7415     NV_STATUS status;
7416     uvm_pte_bits_gpu_t pte_bit;
7417     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7418     uvm_prot_t new_prot = prot_to_revoke - 1;
7419     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7420     block_pte_op_t pte_op;
7421     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id);
7422     uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
7423 
7424     UVM_ASSERT(revoke_page_mask);
7425     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
7426 
7427     // The pages which will actually change are those in the input page mask
7428     // which are resident on the target.
7429     if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
7430         return NV_OK;
7431 
7432     UVM_ASSERT(block_check_mapping_residency(va_block, gpu, resident_id, pages_to_revoke));
7433 
7434     // For PTE merge/split computation, compute all resident pages which will
7435     // have exactly prot_to_revoke-1 after performing the revocation.
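    // The first andnot yields the pages which keep a protection of at least
    // prot_to_revoke because they're not being revoked; subtracting that from
    // the pages which have at least new_prot leaves the pages whose protection
    // will end up being exactly new_prot, clamped to the pages resident on
    // resident_id.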
7436     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
7437     uvm_page_mask_andnot(&block_context->scratch_page_mask,
7438                          &gpu_state->pte_bits[prot_pte_bit - 1],
7439                          &block_context->scratch_page_mask);
7440     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
7441 
7442     block_gpu_compute_new_pte_state(va_block,
7443                                     gpu,
7444                                     resident_id,
7445                                     pages_to_revoke,
7446                                     &block_context->scratch_page_mask,
7447                                     new_pte_state);
7448 
7449     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
7450     if (status != NV_OK)
7451         return status;
7452 
7453     status = uvm_push_begin_acquire(gpu->channel_manager,
7454                                     UVM_CHANNEL_TYPE_MEMOPS,
7455                                     &va_block->tracker,
7456                                     &push,
7457                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
7458                                     uvm_prot_string(prot_to_revoke),
7459                                     va_block->start,
7460                                     va_block->end + 1);
7461     if (status != NV_OK)
7462         return status;
7463 
7464     pte_op = BLOCK_PTE_OP_REVOKE;
7465     if (new_pte_state->pte_is_2m) {
7466         // We're either modifying permissions of a pre-existing 2M PTE, or all
7467         // permissions match so we can merge to a new 2M PTE.
7468         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
7469     }
7470     else if (gpu_state->pte_is_2m) {
7471         // Permissions on a subset of the existing 2M PTE are being downgraded,
7472         // so we have to split it into the appropriate mix of big and 4k PTEs.
7473         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7474     }
7475     else {
7476         // We're downgrading permissions on some pre-existing mix of big and 4K
7477         // PTEs into some other mix of big and 4K PTEs.
7478         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
7479     }
7480 
7481     uvm_push_end(&push);
7482 
7483     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
7484 
7485     // Update GPU mapping state
7486     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
7487         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
7488 
    // Remove all pages resident on this processor from the input mask: both
    // pages which had their permissions revoked and pages which already had
    // the correct permissions.
7492     uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke);
7493 
7494     UVM_ASSERT(block_check_mappings(va_block));
7495 
7496     return uvm_tracker_add_push_safe(out_tracker, &push);
7497 }
7498 
7499 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
7500                                    uvm_va_block_context_t *va_block_context,
7501                                    uvm_processor_id_t id,
7502                                    uvm_va_block_region_t region,
7503                                    const uvm_page_mask_t *revoke_page_mask,
7504                                    uvm_prot_t prot_to_revoke,
7505                                    uvm_tracker_t *out_tracker)
7506 {
7507     uvm_gpu_t *gpu;
7508     uvm_va_block_gpu_state_t *gpu_state;
7509     uvm_processor_mask_t resident_procs;
7510     uvm_processor_id_t resident_id;
7511     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask;
7512     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7513     uvm_pte_bits_gpu_t prot_pte_bit;
7514 
7515     UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY);
7516     UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX);
7517     uvm_assert_mutex_locked(&va_block->lock);
7518 
7519     if (UVM_ID_IS_CPU(id)) {
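        // The CPU has no separate atomic permission (cpu.pte_bits only tracks
        // READ and WRITE), so revoking atomic access is a no-op.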
7520         if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC)
7521             return NV_OK;
7522 
7523         if (uvm_va_block_is_hmm(va_block)) {
7524             // Linux is responsible for CPU page table updates.
7525             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region);
7526             return NV_OK;
7527         }
7528 
7529         uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7530 
7531         if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]))
7532             return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker);
7533 
7534         return NV_OK;
7535     }
7536 
7537     gpu = uvm_va_space_get_gpu(va_space, id);
7538 
7539     // UVM-Lite GPUs should never have access revoked
7540     UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id),
7541                    "GPU %s\n", uvm_gpu_name(gpu));
7542 
7543     // Return early if there are no mappings for the GPU present in the block
7544     if (!uvm_processor_mask_test(&va_block->mapped, gpu->id))
7545         return NV_OK;
7546 
7547     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7548     prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
7549 
7550     uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
7551 
7552     if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit]))
7553         return NV_OK;
7554 
7555     // Revoke per resident location so we can more easily detect physically-
7556     // contiguous mappings.
7557     uvm_processor_mask_copy(&resident_procs, &va_block->resident);
7558 
7559     for_each_closest_id(resident_id, &resident_procs, gpu->id, va_space) {
7560         NV_STATUS status = block_revoke_prot_gpu_to(va_block,
7561                                                     va_block_context,
7562                                                     gpu,
7563                                                     resident_id,
7564                                                     running_page_mask,
7565                                                     prot_to_revoke,
7566                                                     out_tracker);
7567         if (status != NV_OK)
7568             return status;
7569 
7570         // If we've revoked all requested pages, we're done
7571         if (uvm_page_mask_region_empty(running_page_mask, region))
7572             break;
7573     }
7574 
7575     return NV_OK;
7576 }
7577 
7578 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
7579                                 uvm_va_block_context_t *va_block_context,
7580                                 const uvm_processor_mask_t *map_processor_mask,
7581                                 uvm_va_block_region_t region,
7582                                 const uvm_page_mask_t *map_page_mask,
7583                                 uvm_prot_t new_prot,
7584                                 UvmEventMapRemoteCause cause)
7585 {
7586     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7587     NV_STATUS status = NV_OK;
7588     NV_STATUS tracker_status;
7589     uvm_processor_id_t id;
7590 
7591     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
7592 
7593     for_each_id_in_mask(id, map_processor_mask) {
7594         status = uvm_va_block_map(va_block,
7595                                   va_block_context,
7596                                   id,
7597                                   region,
7598                                   map_page_mask,
7599                                   new_prot,
7600                                   cause,
7601                                   &local_tracker);
7602         if (status != NV_OK)
7603             break;
7604     }
7605 
7606     // Regardless of error, add the successfully-pushed mapping operations into
7607     // the block's tracker. Note that we can't overwrite the tracker because we
7608     // aren't guaranteed that the map actually pushed anything (in which case it
7609     // would've acquired the block tracker first).
7610     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7611     uvm_tracker_deinit(&local_tracker);
7612 
7613     return status == NV_OK ? tracker_status : status;
7614 }
7615 
7616 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
7617                                   uvm_va_block_context_t *va_block_context,
7618                                   const uvm_processor_mask_t *unmap_processor_mask,
7619                                   uvm_va_block_region_t region,
7620                                   const uvm_page_mask_t *unmap_page_mask)
7621 {
7622     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7623     NV_STATUS status = NV_OK;
7624     NV_STATUS tracker_status;
7625     uvm_processor_id_t id;
7626 
    // Watch out: unmap_processor_mask could change during iteration since it
    // could be &va_block->mapped.
7629     for_each_id_in_mask(id, unmap_processor_mask) {
7630         // Errors could either be a system-fatal error (ECC) or an allocation
7631         // retry due to PTE splitting. In either case we should stop after
7632         // hitting the first one.
7633         status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker);
7634         if (status != NV_OK)
7635             break;
7636     }
7637 
7638     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7639     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7640     uvm_tracker_deinit(&local_tracker);
7641 
7642     return status == NV_OK ? tracker_status : status;
7643 }
7644 
7645 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
7646                                         uvm_va_block_context_t *va_block_context,
7647                                         const uvm_processor_mask_t *revoke_processor_mask,
7648                                         uvm_va_block_region_t region,
7649                                         const uvm_page_mask_t *revoke_page_mask,
7650                                         uvm_prot_t prot_to_revoke)
7651 {
7652     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7653     NV_STATUS status = NV_OK;
7654     NV_STATUS tracker_status;
7655     uvm_processor_id_t id;
7656 
7657     for_each_id_in_mask(id, revoke_processor_mask) {
7658         status = uvm_va_block_revoke_prot(va_block,
7659                                           va_block_context,
7660                                           id,
7661                                           region,
7662                                           revoke_page_mask,
7663                                           prot_to_revoke,
7664                                           &local_tracker);
7665         if (status != NV_OK)
7666             break;
7667     }
7668 
7669     // See the comment in uvm_va_block_map_mask for adding to the tracker.
7670     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
7671     uvm_tracker_deinit(&local_tracker);
7672 
7673     return status == NV_OK ? tracker_status : status;
7674 }
7675 
7676 // Updates the read_duplicated_pages mask in the block when the state of GPU id
7677 // is being destroyed
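// A page remains read-duplicated only if it is still resident on at least two
// of the remaining processors: running_page_mask accumulates the union of the
// residency masks visited so far, and any intersection with a later
// processor's residency marks those pages as duplicated.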
7678 static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
7679                                               uvm_gpu_id_t id,
7680                                               uvm_va_block_gpu_state_t *gpu_state)
7681 {
7682     uvm_gpu_id_t running_id;
7683     bool first = true;
7684     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7685     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7686     uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask;
7687     uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask;
7688 
7689     uvm_page_mask_zero(&block->read_duplicated_pages);
7690 
7691     for_each_id_in_mask(running_id, &block->resident) {
7692         const uvm_page_mask_t *running_residency_mask;
7693 
7694         if (uvm_id_equal(running_id, id))
7695             continue;
7696 
7697         running_residency_mask = uvm_va_block_resident_mask_get(block, running_id);
7698 
7699         if (first) {
7700             uvm_page_mask_copy(running_page_mask, running_residency_mask);
7701             first = false;
7702             continue;
7703         }
7704 
7705         if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask))
7706             uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask);
7707 
7708         uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask);
7709     }
7710 }
7711 
7712 // Unmaps all GPU mappings under this block, frees the page tables, and frees
7713 // all the GPU chunks. This simply drops the chunks on the floor, so the caller
7714 // must take care of copying the data elsewhere if it needs to remain intact.
7715 //
7716 // This serializes on the block tracker since it must unmap page tables.
7717 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_gpu_id_t id)
7718 {
7719     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
7720     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7721     uvm_gpu_va_space_t *gpu_va_space;
7722     uvm_gpu_t *gpu, *other_gpu;
7723 
7724     if (!gpu_state)
7725         return;
7726 
7727     uvm_assert_mutex_locked(&block->lock);
7728 
7729     // Unmap PTEs and free page tables
7730     gpu = uvm_va_space_get_gpu(va_space, id);
7731     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
7732     if (gpu_va_space) {
7733         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7734 
7735         uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
7736     }
7737 
7738     UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
7739 
7740     // No processor should have this GPU mapped at this point
7741     UVM_ASSERT(block_check_processor_not_mapped(block, id));
7742 
7743     // We need to remove the mappings of the indirect peers from the reverse
7744     // map when the GPU state is being destroyed (for example, on
7745     // unregister_gpu) and when peer access between indirect peers is disabled.
7746     // However, we need to avoid double mapping removals. There are two
7747     // possible scenarios:
7748     // - Disable peer access first. This will remove all mappings between A and
7749     // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
7750     // unregister_gpu will not operate on that pair of GPUs.
7751     // - Unregister GPU first. This will remove all mappings from all indirect
7752     // peers to the GPU being unregistered. It will also destroy its GPU state.
7753     // Subsequent calls to disable peers will remove the mappings from the GPU
7754     // being unregistered, but never to the GPU being unregistered (since it no
7755     // longer has a valid GPU state).
7756     for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
7757         block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
7758 
7759     if (gpu_state->chunks) {
7760         size_t i, num_chunks;
7761 
7762         update_read_duplicated_pages_mask(block, id, gpu_state);
7763         uvm_page_mask_zero(&gpu_state->resident);
7764         block_clear_resident_processor(block, id);
7765 
7766         num_chunks = block_num_gpu_chunks(block, gpu);
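        // Unmap each chunk's internal GPU mappings and return the chunk to
        // PMM. Passing the block's tracker ensures PMM doesn't reuse a chunk
        // before the block's pending operations on it have completed.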
7767         for (i = 0; i < num_chunks; i++) {
7768             uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
7769             if (!chunk)
7770                 continue;
7771 
7772             uvm_mmu_chunk_unmap(chunk, &block->tracker);
7773             uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
7774         }
7775 
7776         uvm_kvfree(gpu_state->chunks);
7777     }
7778     else {
7779         UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
7780     }
7781 
7782 
7783     // Pending operations may still need the DMA memory to be mapped.
7784     uvm_tracker_wait(&block->tracker);
7785 
7786     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
7787     uvm_processor_mask_clear(&block->evicted_gpus, id);
7788 
7789     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
7790     block->gpus[uvm_id_gpu_index(id)] = NULL;
7791 }
7792 
7793 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
7794 {
7795     if (range->table) {
7796         uvm_page_tree_put_ptes(tree, range);
7797         memset(range, 0, sizeof(*range));
7798     }
7799 }
7800 
7801 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
7802 {
7803     uvm_assert_mutex_locked(&va_block->lock);
7804 
7805     if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
7806         return NV_OK;
7807 
7808     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7809     // comments in pre_populate_pde1_gpu.
7810     return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
7811 }
7812 
7813 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
7814                                       uvm_gpu_va_space_t *gpu_va_space,
7815                                       uvm_va_block_context_t *block_context)
7816 {
7817     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7818     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7819     uvm_gpu_t *gpu = gpu_va_space->gpu;
7820     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7821     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7822     uvm_push_t push;
7823     NV_STATUS status;
7824 
7825     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
7826 
7827     if (!gpu_state)
7828         return;
7829 
7830     uvm_assert_mutex_locked(&va_block->lock);
7831 
7832     // Unmapping the whole block won't cause a page table split, so this should
7833     // only fail if we have a system-fatal error.
7834     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
7835     if (status != NV_OK) {
7836         UVM_ASSERT(status == uvm_global_get_status());
7837         return; // Just leak
7838     }
7839 
7840     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
7841 
7842     // Reset the page tables if other allocations could reuse them
7843     if (!block_gpu_supports_2m(va_block, gpu) &&
7844         !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7845 
7846         status = uvm_push_begin_acquire(gpu->channel_manager,
7847                                         UVM_CHANNEL_TYPE_MEMOPS,
7848                                         &local_tracker,
7849                                         &push,
7850                                         "Resetting PTEs for block [0x%llx, 0x%llx)",
7851                                         va_block->start,
7852                                         va_block->end + 1);
7853         if (status != NV_OK) {
7854             UVM_ASSERT(status == uvm_global_get_status());
7855             return; // Just leak
7856         }
7857 
7858         uvm_pte_batch_begin(&push, pte_batch);
7859         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7860 
        // When a big PTE is active, the 4k PTEs under it are garbage. Make
7862         // them invalid so the page tree code can reuse them for other
7863         // allocations on this VA. These don't need TLB invalidates since the
7864         // big PTEs above them are active.
7865         if (gpu_state->page_table_range_4k.table) {
7866             uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
7867             block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
7868         }
7869 
7870         // We unmapped all big PTEs above, which means they have the unmapped
7871         // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
7872         // invalid to activate the 4ks below so new allocations using just those
7873         // 4k PTEs will work.
7874         block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch);
7875 
7876         uvm_pte_batch_end(pte_batch);
7877         uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE);
7878 
7879         uvm_push_end(&push);
7880         uvm_tracker_overwrite_with_push(&local_tracker, &push);
7881     }
7882 
7883     // The unmap must finish before we free the page tables
7884     status = uvm_tracker_wait_deinit(&local_tracker);
7885     if (status != NV_OK)
7886         return; // System-fatal error, just leak
7887 
7888     // Note that if the PTE is currently 2M with lower tables allocated but not
7889     // in use, calling put_ptes on those lower ranges will re-write the 2M entry
7890     // to be a PDE.
7891     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k);
7892     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big);
7893     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m);
7894 
7895     gpu_state->pte_is_2m = false;
7896     gpu_state->initialized_big = false;
7897     gpu_state->activated_big = false;
7898     gpu_state->activated_4k = false;
7899     bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7900 
7901     UVM_ASSERT(block_check_mappings(va_block));
7902 }
7903 
7904 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7905 {
7906     NV_STATUS status;
7907     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7908 
7909     UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
7910     uvm_assert_rwsem_locked_write(&va_space->lock);
7911     uvm_assert_mutex_locked(&va_block->lock);
7912 
7913     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7914         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7915         if (status != NV_OK)
7916             return status;
7917 
7918         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7919         if (status != NV_OK) {
7920             block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7921             return status;
7922         }
7923     }
7924 
7925     // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
7926     //       call it here.
7927 
7928     return NV_OK;
7929 }
7930 
7931 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
7932 {
7933     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7934     NV_STATUS status;
7935     uvm_tracker_t tracker = UVM_TRACKER_INIT();
7936     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7937     uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask;
7938     const uvm_page_mask_t *resident0;
7939     const uvm_page_mask_t *resident1;
7940 
7941     uvm_assert_mutex_locked(&va_block->lock);
7942 
7943     // See comment in block_destroy_gpu_state
7944     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
7945         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
7946         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
7947     }
7948 
7949     // If either of the GPUs doesn't have GPU state then nothing could be mapped
7950     // between them.
7951     if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
7952         return;
7953 
7954     resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id);
7955     resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id);
7956 
7957     // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
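    // Pages resident on both GPUs are mapped to the local copy (see
    // map_get_allowed_destinations and the closest-first iteration in
    // uvm_va_block_map), so only pages resident exclusively on the peer can
    // have peer mappings that need removal.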
7958     if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
7959         status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
7960         if (status != NV_OK) {
7961             // Since all PTEs unmapped by this call have the same aperture, page
7962             // splits should never be required so any failure should be the
7963             // result of a system-fatal error.
7964             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7965                            "Unmapping failed: %s, GPU %s\n",
7966                            nvstatusToString(status),
7967                            uvm_gpu_name(gpu0));
7968         }
7969     }
7970 
7971     // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
7972     if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
7973         status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
7974         if (status != NV_OK) {
7975             UVM_ASSERT_MSG(status == uvm_global_get_status(),
7976                            "Unmapping failed: %s, GPU %s\n",
7977                            nvstatusToString(status),
                           uvm_gpu_name(gpu1));
7979         }
7980     }
7981 
7982     status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
7983     if (status != NV_OK)
7984         UVM_ASSERT(status == uvm_global_get_status());
7985 
7986     status = uvm_tracker_wait_deinit(&tracker);
7987     if (status != NV_OK)
7988         UVM_ASSERT(status == uvm_global_get_status());
7989 }
7990 
7991 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
7992 {
7993     NV_STATUS status;
7994     uvm_va_range_t *va_range = va_block->va_range;
7995     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
7996     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
7997     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
7998 
7999     uvm_assert_mutex_locked(&va_block->lock);
8000     UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
8001 
8002     // If the GPU doesn't have GPU state then nothing could be mapped.
8003     if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
8004         return;
8005 
8006     // In UVM-Lite mode, mappings to the preferred location are not tracked
8007     // directly, so just unmap the whole block.
8008     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
8009     if (status != NV_OK) {
8010         // Unmapping the whole block should not cause page splits so any failure
8011         // should be the result of a system-fatal error.
8012         UVM_ASSERT_MSG(status == uvm_global_get_status(),
8013                        "Unmapping failed: %s, GPU %s\n",
8014                        nvstatusToString(status), uvm_gpu_name(gpu));
8015     }
8016 
8017     status = uvm_tracker_wait(&va_block->tracker);
8018     if (status != NV_OK) {
8019         UVM_ASSERT_MSG(status == uvm_global_get_status(),
8020                        "Unmapping failed: %s, GPU %s\n",
8021                        nvstatusToString(status), uvm_gpu_name(gpu));
8022     }
8023 }
8024 
8025 // Evict pages from the GPU by moving each resident region to the CPU
8026 //
8027 // Notably the caller needs to support allocation-retry as
8028 // uvm_va_block_migrate_locked() requires that.
8029 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8030 {
8031     NV_STATUS status = NV_OK;
8032     const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id);
8033     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
8034     uvm_va_block_region_t subregion;
8035     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8036     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, mm);
8037 
8038     // Move all subregions resident on the GPU to the CPU
8039     for_each_va_block_subregion_in_mask(subregion, resident, region) {
8040         if (uvm_va_block_is_hmm(va_block)) {
8041             status = uvm_hmm_va_block_evict_pages_from_gpu(va_block,
8042                                                            gpu,
8043                                                            block_context,
8044                                                            resident,
8045                                                            subregion);
8046         }
8047         else {
8048             status = uvm_va_block_migrate_locked(va_block,
8049                                                  NULL,
8050                                                  block_context,
8051                                                  subregion,
8052                                                  UVM_ID_CPU,
8053                                                  UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
8054                                                  NULL);
8055         }
8056         if (status != NV_OK)
8057             return status;
8058     }
8059 
8060     UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id));
8061     return NV_OK;
8062 }
8063 
8064 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8065 {
8066     NV_STATUS status;
8067     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8068 
8069     uvm_assert_mutex_locked(&va_block->lock);
8070 
8071     if (!gpu_state)
8072         return;
8073 
8074     // The mappings should've already been torn down by GPU VA space unregister
8075     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
8076     UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
8077     UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu));
8078 
8079     // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and
8080     // we don't rely on any state of the block across the call.
8081     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));
8082     if (status != NV_OK) {
8083         UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n",
8084                       nvstatusToString(status),
8085                       uvm_gpu_name(gpu));
8086         uvm_global_set_fatal_error(status);
8087     }
8088 
    // block_destroy_gpu_state() copies the block's tracker into each chunk and
    // then frees the chunk to PMM. If we do this before waiting for the block
    // tracker below, we'll populate PMM's free chunks with tracker entries,
    // which gives us better testing coverage of chunk synchronization on GPU
    // unregister.
8093     block_destroy_gpu_state(va_block, gpu->id);
8094 
8095     // Any time a GPU is unregistered we need to make sure that there are no
8096     // pending (direct or indirect) tracker entries for that GPU left in the
8097     // block's tracker. The only way to ensure that is to wait for the whole
8098     // tracker.
8099     status = uvm_tracker_wait(&va_block->tracker);
8100     if (status != NV_OK)
8101         UVM_ASSERT(status == uvm_global_get_status());
8102 }
8103 
8104 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
8105 {
8106     // Take the lock internally to not expose the caller to allocation-retry.
8107     uvm_mutex_lock(&va_block->lock);
8108 
8109     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
8110 
8111     uvm_mutex_unlock(&va_block->lock);
8112 }
8113 
8114 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
8115 {
8116     uvm_page_index_t page_index;
8117 
8118     uvm_assert_mutex_locked(&va_block->lock);
8119 
8120     for_each_va_block_page_in_region_mask (page_index, &va_block->cpu.resident, region)
8121         block_mark_cpu_page_dirty(va_block, page_index);
8122 }
8123 
8124 // Tears down everything within the block, but doesn't free the block itself.
// Note that when uvm_va_block_kill is called, this is called twice: once for
// the initial kill itself, then again when the block's ref count eventually
// drops to zero and the block is destroyed. block->va_range is used to track
// whether the block has already been killed.
8129 static void block_kill(uvm_va_block_t *block)
8130 {
8131     uvm_va_space_t *va_space;
8132     uvm_perf_event_data_t event_data;
8133     uvm_cpu_chunk_t *chunk;
8134     uvm_gpu_id_t id;
8135     NV_STATUS status;
8136     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
8137     uvm_page_index_t page_index;
8138     uvm_page_index_t next_page_index;
8139 
8140     if (uvm_va_block_is_dead(block))
8141         return;
8142 
8143     va_space = uvm_va_block_get_va_space(block);
8144     event_data.block_destroy.block = block;
8145     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
8146 
8147     // Unmap all processors in parallel first. Unmapping the whole block won't
8148     // cause a page table split, so this should only fail if we have a system-
8149     // fatal error.
8150     if (!uvm_processor_mask_empty(&block->mapped)) {
8151         uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8152 
8153         // HMM CPU mappings are controlled by Linux so no need to unmap.
8154         // Remote GPU mappings will be removed below.
8155         if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
8156             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
8157             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
8158             uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
8159         }
8160 
8161         // We could only be killed with mapped GPU state by VA range free or VA
8162         // space teardown, so it's safe to use the va_space's block_context
8163         // because both of those have the VA space lock held in write mode.
8164         status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
8165         UVM_ASSERT(status == uvm_global_get_status());
8166     }
8167 
8168     UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
8169 
8170     // Free the GPU page tables and chunks
8171     for_each_gpu_id(id)
8172         block_destroy_gpu_state(block, id);
8173 
8174     // Wait for the GPU PTE unmaps before freeing CPU memory
8175     uvm_tracker_wait_deinit(&block->tracker);
8176 
8177     // No processor should have the CPU mapped at this point
8178     UVM_ASSERT(block_check_processor_not_mapped(block, UVM_ID_CPU));
8179 
8180     // Free CPU pages
8181     for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block) {
        // Tell the OS we wrote to the page because we sometimes clear the
        // dirty bit after writing to it, so be conservative and mark it dirty
        // here. HMM dirty flags are managed by the kernel, so skip HMM blocks.
8185         if (!uvm_va_block_is_hmm(block))
8186             uvm_cpu_chunk_mark_dirty(chunk, 0);
8187         uvm_cpu_chunk_remove_from_block(block, page_index);
8188         uvm_cpu_chunk_free(chunk);
8189     }
8190 
8191     uvm_kvfree((void *)block->cpu.chunks);
8192     block->cpu.chunks = 0;
8193 
8194     // Clearing the resident bit isn't strictly necessary since this block
8195     // is getting destroyed, but it keeps state consistent for assertions.
8196     uvm_page_mask_zero(&block->cpu.resident);
8197     block_clear_resident_processor(block, UVM_ID_CPU);
8198 
8199     if (uvm_va_block_is_hmm(block))
8200         uvm_va_policy_clear(block, block->start, block->end);
8201 
8202     block->va_range = NULL;
8203 #if UVM_IS_CONFIG_HMM()
8204     block->hmm.va_space = NULL;
8205 #endif
8206 }
8207 
8208 // Called when the block's ref count drops to 0
8209 void uvm_va_block_destroy(nv_kref_t *nv_kref)
8210 {
8211     uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref);
8212 
8213     // Nobody else should have a reference when freeing
8214     uvm_assert_mutex_unlocked(&block->lock);
8215 
8216     uvm_mutex_lock(&block->lock);
8217     block_kill(block);
8218     uvm_mutex_unlock(&block->lock);
8219 
8220     if (uvm_enable_builtin_tests) {
8221         uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
8222 
8223         kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
8224     }
8225     else {
8226         kmem_cache_free(g_uvm_va_block_cache, block);
8227     }
8228 }
8229 
8230 void uvm_va_block_kill(uvm_va_block_t *va_block)
8231 {
8232     uvm_mutex_lock(&va_block->lock);
8233     block_kill(va_block);
8234     uvm_mutex_unlock(&va_block->lock);
8235 
8236     // May call block_kill again
8237     uvm_va_block_release(va_block);
8238 }
8239 
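// Release the block's state for the GPU chunks backing the given pages: unmap
// each chunk, clear its pointer in the chunks array, and clear the GPU's
// residency for the block once no resident pages remain. The chunks themselves
// are freed later, when their device private references drop.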
8240 static void block_gpu_release_region(uvm_va_block_t *va_block,
8241                                      uvm_gpu_id_t gpu_id,
8242                                      uvm_va_block_gpu_state_t *gpu_state,
8243                                      uvm_page_mask_t *page_mask,
8244                                      uvm_va_block_region_t region)
8245 {
8246     uvm_page_index_t page_index;
8247 
8248     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
8249         uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index];
8250 
8251         if (!gpu_chunk)
8252             continue;
8253 
8254         // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
8255 
8256         uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
8257 
8258         // The GPU chunk will be freed when the device private reference drops.
8259         if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) &&
8260             uvm_page_mask_empty(&gpu_state->resident))
8261             block_clear_resident_processor(va_block, gpu_id);
8262 
8263         gpu_state->chunks[page_index] = NULL;
8264     }
8265 }
8266 
8267 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
8268                                 uvm_va_block_region_t region)
8269 {
8270     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8271     uvm_perf_event_data_t event_data;
8272     uvm_gpu_id_t gpu_id;
8273 
8274     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
8275     uvm_assert_mutex_locked(&va_block->lock);
8276 
8277     // Reset thrashing state for the region.
8278     event_data.block_munmap.block = va_block;
8279     event_data.block_munmap.region = region;
8280     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
8281 
8282     // Set a flag so that GPU fault events are flushed since they might refer
8283     // to the region being unmapped.
8284     // Note that holding the va_block lock prevents GPU VA spaces from
8285     // being removed so the registered_gpu_va_spaces mask is stable.
8286     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpu_va_spaces) {
8287         uvm_processor_mask_set_atomic(&va_space->needs_fault_buffer_flush, gpu_id);
8288     }
8289 
8290     // Release any remaining vidmem chunks in the given region.
8291     for_each_gpu_id(gpu_id) {
8292         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
8293 
8294         if (!gpu_state)
8295             continue;
8296 
8297         uvm_page_mask_region_clear(&gpu_state->evicted, region);
8298         if (uvm_page_mask_empty(&gpu_state->evicted))
8299             uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
8300 
8301         if (gpu_state->chunks) {
8302             block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
8303 
8304             // TODO: bug 3660922: Need to update the read duplicated pages mask
8305             // when read duplication is supported for HMM.
8306         }
8307         else {
8308             UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id));
8309         }
8310     }
8311 
8312     uvm_va_policy_clear(va_block,
8313                         uvm_va_block_region_start(va_block, region),
8314                         uvm_va_block_region_end(va_block, region));
8315 }
8316 
8317 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8318 {
8319     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8320     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8321     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
8322     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
8323     NvU32 alloc_sizes;
8324     DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8325     uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8326     size_t big_page_index;
8327     uvm_push_t push;
8328     NV_STATUS status;
8329 
8330     // We only have to split to big PTEs if we're currently a 2M PTE
8331     if (existing_gpu_state->pte_is_2m) {
8332         // We can skip the split if the 2M PTE is invalid and we have no lower
8333         // PTEs.
8334         if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE &&
8335             !existing_gpu_state->page_table_range_big.table &&
8336             !existing_gpu_state->page_table_range_4k.table)
8337             return NV_OK;
8338 
8339         alloc_sizes = big_page_size;
8340         bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8341 
8342         if (!IS_ALIGNED(new->start, big_page_size)) {
8343             alloc_sizes |= UVM_PAGE_SIZE_4K;
8344 
8345             big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8346             __clear_bit(big_page_index, new_big_ptes);
8347         }
8348 
8349         status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL);
8350         if (status != NV_OK)
8351             return status;
8352 
8353         status = uvm_push_begin_acquire(gpu->channel_manager,
8354                                         UVM_CHANNEL_TYPE_MEMOPS,
8355                                         &existing->tracker,
8356                                         &push,
8357                                         "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8358                                         existing->start, existing->end + 1,
8359                                         new->start, new->end + 1);
8360         if (status != NV_OK)
8361             return status;
8362 
8363         block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push);
8364     }
8365     else {
8366         big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
8367 
8368         // If the split point is on a big page boundary, or if the split point
8369         // is not currently covered by a big PTE, we don't have to split
8370         // anything.
8371         if (IS_ALIGNED(new->start, big_page_size) ||
8372             big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK ||
8373             !test_bit(big_page_index, existing_gpu_state->big_ptes))
8374             return NV_OK;
8375 
8376         status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL);
8377         if (status != NV_OK)
8378             return status;
8379 
8380         bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
8381         __set_bit(big_page_index, new_big_ptes);
8382 
8383         status = uvm_push_begin_acquire(gpu->channel_manager,
8384                                         UVM_CHANNEL_TYPE_MEMOPS,
8385                                         &existing->tracker,
8386                                         &push,
8387                                         "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
8388                                         existing->start, existing->end + 1,
8389                                         new->start, new->end + 1);
8390         if (status != NV_OK)
8391             return status;
8392 
8393         block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push);
8394     }
8395 
8396     uvm_push_end(&push);
8397 
    // Adding this push to the existing block's tracker will cause all GPU PTE
    // splits to serialize on each other, but it's simpler than maintaining a
    // separate tracker and this path isn't performance-critical.
8401     return uvm_tracker_add_push_safe(&existing->tracker, &push);
8402 }
8403 
8404 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new)
8405 {
8406     uvm_gpu_t *gpu;
8407     uvm_gpu_id_t id;
8408     NV_STATUS status;
8409 
8410     for_each_gpu_id(id) {
8411         if (!uvm_va_block_gpu_state_get(existing, id))
8412             continue;
8413 
8414         gpu = block_get_gpu(existing, id);
8415 
8416         if (block_gpu_has_page_tables(existing, gpu)) {
8417             status = block_split_presplit_ptes_gpu(existing, new, gpu);
8418             if (status != NV_OK)
8419                 return status;
8420         }
8421     }
8422 
8423     return NV_OK;
8424 }
8425 
8426 typedef struct
8427 {
8428     // Number of chunks contained by this VA block
8429     size_t num_chunks;
8430 
8431     // Index of the "interesting" chunk, either adjacent to or spanning the
8432     // split point depending on which block this is.
8433     size_t chunk_index;
8434 
8435     // Size of the chunk referenced by chunk_index
8436     uvm_chunk_size_t chunk_size;
8437 } block_gpu_chunk_split_state_t;
8438 
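// Compute the GPU chunk layout of the [start, end] portion of block: the
// number of chunks covering it, plus the index and size of the chunk
// containing page_index.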
8439 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
8440                                             block_gpu_chunk_split_state_t *state,
8441                                             NvU64 start,
8442                                             NvU64 end,
8443                                             uvm_page_index_t page_index,
8444                                             uvm_gpu_t *gpu)
8445 {
8446     NvU64 size = end - start + 1;
8447     state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu);
8448     state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
8449 }
8450 
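// Merge a previously-split GPU chunk back together in PMM and restore any
// indirect peers' reverse sysmem mappings to the merged chunk's size.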
8451 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
8452 {
8453     uvm_gpu_t *accessing_gpu;
8454     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8455 
8456     uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
8457 
8458     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8459         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
8460 
8461         uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8462                                                          peer_addr,
8463                                                          uvm_gpu_chunk_get_size(chunk));
8464     }
8465 }
8466 
8467 // Perform any chunk splitting and array growing required for this block split,
8468 // but don't actually move chunk pointers anywhere.
8469 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
8470 {
8471     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
8472     uvm_gpu_t *accessing_gpu;
8473     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
8474     uvm_gpu_chunk_t **temp_chunks;
8475     uvm_gpu_chunk_t *original_chunk, *curr_chunk;
8476     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8477     uvm_chunk_sizes_mask_t split_sizes;
8478     uvm_chunk_size_t subchunk_size;
8479     NV_STATUS status;
8480     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
8481 
8482     block_gpu_chunk_get_split_state(existing,
8483                                     &existing_before_state,
8484                                     existing->start,
8485                                     existing->end,
8486                                     split_page_index,
8487                                     gpu);
8488     block_gpu_chunk_get_split_state(existing,
8489                                     &existing_after_state,
8490                                     existing->start,
8491                                     new->start - 1,
8492                                     split_page_index - 1,
8493                                     gpu);
8494     block_gpu_chunk_get_split_state(new,
8495                                     &new_state,
8496                                     new->start,
8497                                     new->end,
8498                                     0,
8499                                     gpu);
8500 
8501     // Even though we're splitting existing, we could wind up requiring a larger
8502     // chunks array if we split a large chunk into many smaller ones.
8503     if (existing_after_state.num_chunks > existing_before_state.num_chunks) {
8504         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
8505                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
8506         if (!temp_chunks)
8507             return NV_ERR_NO_MEMORY;
8508         existing_gpu_state->chunks = temp_chunks;
8509     }
8510 
8511     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
8512 
8513     // If the chunk covering the split point is not populated, we're done. We've
8514     // already grown the array to cover any new chunks which may be populated
8515     // later.
8516     if (!original_chunk)
8517         return NV_OK;
8518 
8519     // Figure out the splits we need to perform. Remove all sizes >= the current
8520     // size, and all sizes < the target size. Note that the resulting mask will
8521     // be 0 if the sizes match (we're already splitting at a chunk boundary).
8522     UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size);
8523     UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size);
8524     split_sizes = gpu->parent->mmu_user_chunk_sizes;
8525     split_sizes &= existing_before_state.chunk_size - 1;
8526     split_sizes &= ~(new_state.chunk_size - 1);
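
    // For example, assuming a GPU whose user chunk sizes are {4K, 64K, 2M}:
    // splitting a 2M chunk at a 64K-aligned point leaves split_sizes == {64K},
    // while splitting it at a point that is only 4K-aligned leaves {64K, 4K},
    // so the loop below first splits the 2M chunk into 64K subchunks and then
    // splits the covering 64K subchunk into 4K.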
8527 
8528     // Keep splitting the chunk covering the split point until we hit the target
8529     // size.
8530     curr_chunk = original_chunk;
8531     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8532         size_t last_index, num_subchunks;
8533 
8534         status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL);
8535         if (status != NV_OK)
8536             goto error;
8537 
8538         // Split physical GPU mappings for indirect peers
8539         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
8540             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
8541 
8542             status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
8543                                                                       peer_addr,
8544                                                                       subchunk_size);
8545             if (status != NV_OK)
8546                 goto error;
8547         }
8548 
8549         if (subchunk_size == new_state.chunk_size)
8550             break;
8551 
8552         // Compute the last subchunk index prior to the split point. Divide the
8553         // entire address space into units of subchunk_size, then mod by the
8554         // number of subchunks within the parent.
8555         last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size);
8556         num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size);
8557         UVM_ASSERT(num_subchunks > 1);
8558         last_index &= num_subchunks - 1;
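
        // For example, if curr_chunk is a 2M chunk being split into 64K
        // subchunks and the split point lies 192K into it, the last byte
        // before the split falls in subchunk index 2 of the parent's 32
        // subchunks, so curr_chunk advances to that subchunk for the next
        // split.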
8559 
8560         uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk);
8561         UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size);
8562     }
8563 
8564     // Note that existing's chunks array still has a pointer to original_chunk,
8565     // not to any newly-split subchunks. If a subsequent split failure occurs on
8566     // a later GPU we'll have to merge it back. Once we're past the preallocate
8567     // stage we'll remove it from the chunks array and move the new split chunks
8568     // in.
8569 
8570     return NV_OK;
8571 
8572 error:
8573     // On error we need to leave the chunk in its initial state
8574     block_merge_chunk(existing, gpu, original_chunk);
8575 
8576     return status;
8577 }
8578 
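// Split the block's single 2M CPU chunk into 64K chunks, switching the block's
// CPU chunk storage from a single chunk pointer to the mixed format.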
8579 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block)
8580 {
8581     uvm_cpu_chunk_storage_mixed_t *mixed;
8582     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, 0);
8583     NV_STATUS status;
8584 
8585     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
8586     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_CHUNK);
8587 
8588     mixed = uvm_kvmalloc_zero(sizeof(*mixed));
8589     if (!mixed)
8590         return NV_ERR_NO_MEMORY;
8591 
8592     status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots);
8593     if (status != NV_OK) {
8594         uvm_kvfree(mixed);
8595         return status;
8596     }
8597 
8598     bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
8599     block->cpu.chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8600     return status;
8601 }
8602 
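// Split the 64K CPU chunk covering page_index into 4K chunks, replacing the
// corresponding big-chunk slot in the block's mixed storage with an array of
// 4K chunks.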
8603 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index)
8604 {
8605     uvm_cpu_chunk_storage_mixed_t *mixed;
8606     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8607     uvm_cpu_chunk_t **small_chunks;
8608     size_t slot_index;
8609     NV_STATUS status;
8610 
8611     UVM_ASSERT(chunk);
8612     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8613     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8614 
8615     mixed = uvm_cpu_storage_get_ptr(block);
8616     slot_index = compute_slot_index(block, page_index);
8617     small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8618     if (!small_chunks)
8619         return NV_ERR_NO_MEMORY;
8620 
8621     status = uvm_cpu_chunk_split(chunk, small_chunks);
8622     if (status != NV_OK) {
8623         uvm_kvfree(small_chunks);
8624         return status;
8625     }
8626 
8627     mixed->slots[slot_index] = small_chunks;
8628     clear_bit(slot_index, mixed->big_chunks);
8629     return status;
8630 }
8631 
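// Split the CPU chunk covering page_index one level down (2M to 64K, or 64K to
// 4K), splitting any GPU reverse sysmem mappings of the chunk to match. On
// failure, the GPU mappings are merged back to the original chunk size.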
8632 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8633 {
8634     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8635     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
8636     uvm_chunk_size_t new_size;
8637     uvm_gpu_t *gpu;
8638     NvU64 gpu_mapping_addr;
8639     uvm_processor_mask_t gpu_split_mask;
8640     uvm_gpu_id_t id;
8641     NV_STATUS status;
8642 
8643     if (chunk_size == UVM_CHUNK_SIZE_2M)
8644         new_size = UVM_CHUNK_SIZE_64K;
8645     else
8646         new_size = UVM_CHUNK_SIZE_4K;
8647 
8648     UVM_ASSERT(IS_ALIGNED(chunk_size, new_size));
8649 
8650     uvm_processor_mask_zero(&gpu_split_mask);
8651     for_each_gpu_id(id) {
8652         if (!uvm_va_block_gpu_state_get(block, id))
8653             continue;
8654 
8655         gpu = block_get_gpu(block, id);
8656 
8657         // If the parent chunk has not been mapped, there is nothing to split.
8658         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8659         if (gpu_mapping_addr == 0)
8660             continue;
8661 
8662         status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8663                                                             gpu_mapping_addr,
8664                                                             new_size);
8665         if (status != NV_OK)
8666             goto merge;
8667 
8668         uvm_processor_mask_set(&gpu_split_mask, id);
8669     }
8670 
8671     if (new_size == UVM_CHUNK_SIZE_64K)
8672         status = block_split_cpu_chunk_to_64k(block);
8673     else
8674         status = block_split_cpu_chunk_to_4k(block, page_index);
8675 
8676     if (status != NV_OK) {
8677 merge:
8678         for_each_gpu_id_in_mask(id, &gpu_split_mask) {
8679             gpu = block_get_gpu(block, id);
8680             gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8681             uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8682                                                        gpu_mapping_addr,
8683                                                        chunk_size);
8684         }
8685     }
8686 
8687     return status;
8688 }
8689 
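// Pre-allocate the mixed CPU chunk storage the new block will need after the
// split, mirroring the 4K slot arrays already present in existing, so that
// block_split_cpu() itself cannot fail.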
8690 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new)
8691 {
8692     uvm_cpu_chunk_storage_mixed_t *existing_mixed;
8693     uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
8694     size_t slot_offset;
8695     size_t existing_slot;
8696     NV_STATUS status = NV_OK;
8697 
8698     UVM_ASSERT(uvm_cpu_storage_get_type(existing) == UVM_CPU_CHUNK_STORAGE_MIXED);
8699     existing_mixed = uvm_cpu_storage_get_ptr(existing);
8700 
    // Pre-allocate chunk storage for the new block. By definition, the new
    // block will contain only 64K and/or 4K chunks.
8703     //
8704     // We do this here so there are no failures in block_split_cpu().
8705     new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed));
8706     if (!new_mixed)
8707         return NV_ERR_NO_MEMORY;
8708 
8709     slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start));
8710     existing_slot = slot_offset;
8711     for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) {
8712         size_t new_slot = existing_slot - slot_offset;
8713 
8714         if (existing_mixed->slots[existing_slot]) {
8715             uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
8716 
8717             if (!small_chunks) {
8718                 status = NV_ERR_NO_MEMORY;
8719                 goto done;
8720             }
8721 
8722             new_mixed->slots[new_slot] = small_chunks;
8723         }
8724     }
8725 
8726     new->cpu.chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
8727     UVM_ASSERT(status == NV_OK);
8728 
done:
    if (status != NV_OK) {
        // Free the slot arrays allocated before the failure. The loop stops
        // before reaching slot_offset itself, so free new_mixed->slots[0]
        // explicitly as well (uvm_kvfree is a no-op on NULL).
        for (; existing_slot > slot_offset; existing_slot--)
            uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]);

        uvm_kvfree(new_mixed->slots[0]);
        uvm_kvfree(new_mixed);
    }
8736 
8737     return status;
8738 }
8739 
8740 static void block_free_cpu_chunk_storage(uvm_va_block_t *block)
8741 {
8742     if (block->cpu.chunks) {
8743         uvm_cpu_chunk_storage_mixed_t *mixed;
8744         size_t slot_index;
8745 
8746         UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8747         mixed = uvm_cpu_storage_get_ptr(block);
8748         for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
8749             uvm_kvfree(mixed->slots[slot_index]);
8750 
8751         uvm_kvfree(mixed);
8752         block->cpu.chunks = 0;
8753     }
8754 }
8755 
8756 // Perform any CPU chunk splitting that may be required for this block split.
8757 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block.
8758 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8759 {
8760     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8761     uvm_cpu_chunk_t *splitting_chunk;
8762     uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
8763     uvm_chunk_size_t subchunk_size;
8764     NV_STATUS status = NV_OK;
8765 
8766     UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
8767     splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8768 
8769     // If the page covering the split point has not been populated, there is no
8770     // need to split.
8771     if (!splitting_chunk)
8772         return NV_OK;
8773 
8774     // If the split point is aligned on the chunk size, there is no need to
8775     // split.
8776     if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
8777         return NV_OK;
8778 
    // Remove all sizes at or above the chunk's current size.
8780     split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
8781     // Remove all sizes below the alignment of the new block's start.
8782     split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
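
    // For example, assuming the usual CPU chunk allocation sizes of
    // {4K, 64K, 2M}: splitting a 2M chunk at a point that is only 4K-aligned
    // leaves split_sizes == {64K, 4K}, so the loop below splits twice, while a
    // 64K-aligned split point leaves {64K} and a single split suffices.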
8783 
8784     for_each_chunk_size_rev(subchunk_size, split_sizes) {
8785         status = block_split_cpu_chunk_one(existing, page_index);
8786         if (status != NV_OK)
8787             return status;
8788     }
8789 
8790     return block_prealloc_cpu_chunk_storage(existing, new);
8791 }
8792 
8793 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index)
8794 {
8795     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8796     size_t slot_index = compute_slot_index(block, page_index);
8797     uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
8798     uvm_cpu_chunk_t *merged_chunk;
8799 
8800     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8801     UVM_ASSERT(small_chunks);
8802     UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
8803 
8804     merged_chunk = uvm_cpu_chunk_merge(small_chunks);
8805     mixed->slots[slot_index] = merged_chunk;
8806     set_bit(slot_index, mixed->big_chunks);
8807     uvm_kvfree(small_chunks);
8808 }
8809 
8810 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index)
8811 {
8812     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(block);
8813     uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
8814     uvm_cpu_chunk_t *merged_chunk;
8815 
8816     UVM_ASSERT(uvm_cpu_storage_get_type(block) == UVM_CPU_CHUNK_STORAGE_MIXED);
8817     UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
8818 
8819     merged_chunk = uvm_cpu_chunk_merge(big_chunks);
8820     block->cpu.chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
8821     uvm_kvfree(mixed);
8822 }
8823 
8824 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index)
8825 {
8826     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8827     uvm_gpu_id_t id;
8828 
8829     if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
8830         block_merge_cpu_chunks_to_64k(block, page_index);
8831     }
8832     else {
8833         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
8834         block_merge_cpu_chunks_to_2m(block, page_index);
8835     }
8836 
8837     chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
8838 
8839     for_each_gpu_id(id) {
8840         NvU64 gpu_mapping_addr;
8841         uvm_gpu_t *gpu;
8842 
8843         if (!uvm_va_block_gpu_state_get(block, id))
8844             continue;
8845 
8846         gpu = block_get_gpu(block, id);
8847         gpu_mapping_addr = uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent);
8848         if (gpu_mapping_addr == 0)
8849             continue;
8850 
8851         uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
8852                                                    gpu_mapping_addr,
8853                                                    uvm_cpu_chunk_get_size(chunk));
8854     }
8855 }
8856 
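// Merge the CPU chunks covering the split point in existing back up to as
// large a size as the block geometry and the allocated page mask allow, then
// free any chunk storage that was preallocated for new. It is used on the
// split error path to undo block_presplit_cpu_chunks().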
8857 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
8858 {
8859     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
8860     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8861     uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
8862     uvm_chunk_size_t largest_size;
8863     uvm_chunk_size_t chunk_size;
8864     uvm_chunk_size_t merge_size;
8865     size_t block_size = uvm_va_block_size(existing);
8866 
8867     if (!chunk || uvm_cpu_chunk_is_physical(chunk))
8868         return;
8869 
8870     chunk_size = uvm_cpu_chunk_get_size(chunk);
8871 
8872     // Remove all CPU chunk sizes above the size of the existing VA block.
8873     // Since block sizes are not always powers of 2, use the largest power of 2
8874     // less than or equal to the block size since we can't merge to a size
8875     // larger than the block's size.
8876     largest_size = rounddown_pow_of_two(block_size);
8877     merge_sizes &= (largest_size | (largest_size - 1));
8878 
    // Remove all CPU chunk sizes at or below the size of the chunk being
    // merged up.
8880     merge_sizes &= ~(chunk_size | (chunk_size - 1));
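
    // For example, assuming CPU chunk sizes of {4K, 64K, 2M}: merging up from
    // a 4K chunk in a full 2M block leaves merge_sizes == {64K, 2M}, so the
    // first pass merges 4K chunks into a 64K chunk and the second merges the
    // 64K chunks into a single 2M chunk, provided every page in each target
    // region is allocated.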
8881 
8882     for_each_chunk_size(merge_size, merge_sizes) {
8883         uvm_va_block_region_t chunk_region;
8884 
8885         // The block has to fully contain the VA range after the merge.
8886         if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
8887             !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
8888             break;
8889 
8890         chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
8891 
8892         // If not all pages in the region covered by the chunk are allocated,
8893         // we can't merge.
8894         if (!uvm_page_mask_region_full(&existing->cpu.allocated, chunk_region))
8895             break;
8896 
8897         block_merge_cpu_chunks_one(existing, chunk_region.first);
8898         chunk = uvm_cpu_chunk_get_chunk_for_page(existing, page_index);
8899         if (uvm_cpu_chunk_is_physical(chunk))
8900             break;
8901     }
8902 
8903     block_free_cpu_chunk_storage(new);
8904 }
8905 
// Pre-allocate, on both existing and new, everything which will be needed to
// handle a split and which doesn't require allocation-retry. If this fails,
// existing must remain functionally unmodified.
8909 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new)
8910 {
8911     NV_STATUS status;
8912     uvm_gpu_t *gpu;
8913     uvm_gpu_id_t id;
8914     uvm_page_index_t split_page_index;
8915     uvm_va_block_test_t *block_test;
8916 
8917     status = block_presplit_cpu_chunks(existing, new);
8918     if (status != NV_OK)
8919         goto error;
8920 
8921     for_each_gpu_id(id) {
8922         if (!uvm_va_block_gpu_state_get(existing, id))
8923             continue;
8924 
8925         gpu = block_get_gpu(existing, id);
8926 
8927         status = block_presplit_gpu_chunks(existing, new, gpu);
8928         if (status != NV_OK)
8929             goto error;
8930 
8931         if (!block_gpu_state_get_alloc(new, gpu)) {
8932             status = NV_ERR_NO_MEMORY;
8933             goto error;
8934         }
8935     }
8936 
8937     block_test = uvm_va_block_get_test(existing);
8938     if (block_test && block_test->inject_split_error) {
8939         block_test->inject_split_error = false;
8940         if (!uvm_va_block_is_hmm(existing)) {
8941             UVM_ASSERT(existing->va_range->inject_split_error);
8942             existing->va_range->inject_split_error = false;
8943         }
8944         status = NV_ERR_NO_MEMORY;
8945         goto error;
8946     }
8947 
8948     if (uvm_va_block_is_hmm(existing)) {
8949         uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start);
8950 
8951         if (node && node->node.start != new->start) {
8952             status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL);
8953             if (status != NV_OK)
8954                 goto error;
8955         }
8956     }
8957 
8958     return NV_OK;
8959 
8960 error:
8961     // Merge back the chunks we split
8962     split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
8963 
8964     for_each_gpu_id(id) {
8965         uvm_gpu_chunk_t *chunk;
8966         size_t chunk_index;
8967         uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id);
8968 
8969         if (!existing_gpu_state)
8970             continue;
8971 
8972         // If the chunk spanning the split point was split, merge it back
8973         gpu = block_get_gpu(existing, id);
8974         chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL);
8975         chunk = existing_gpu_state->chunks[chunk_index];
8976         if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
8977             continue;
8978 
8979         block_merge_chunk(existing, gpu, chunk);
8980 
8981         // We could attempt to shrink the chunks array back down, but it doesn't
8982         // hurt much to have it larger than necessary, and we'd have to handle
8983         // the shrink call failing anyway on this error path.
8984 
8985     }
8986 
8987     block_merge_cpu_chunks(existing, new);
8988 
8989     return status;
8990 }
8991 
8992 // Re-calculate the block's top-level processor masks:
8993 //   - block->mapped
8994 //   - block->resident
8995 //
8996 // This is called on block split.
8997 static void block_set_processor_masks(uvm_va_block_t *block)
8998 {
8999     size_t num_pages = uvm_va_block_num_cpu_pages(block);
9000     uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages);
9001     uvm_gpu_id_t id;
9002 
9003     if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) {
9004         UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region));
9005         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
9006     }
9007     else {
9008         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
9009     }
9010 
9011     if (uvm_page_mask_region_empty(&block->cpu.resident, block_region)) {
9012         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9013 
9014         if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
9015             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU));
9016 
9017         block_clear_resident_processor(block, UVM_ID_CPU);
9018     }
9019     else {
9020         block_set_resident_processor(block, UVM_ID_CPU);
9021     }
9022 
9023     for_each_gpu_id(id) {
9024         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
9025         if (!gpu_state)
9026             continue;
9027 
9028         if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) {
9029             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region));
9030             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region));
9031             uvm_processor_mask_clear(&block->mapped, id);
9032         }
9033         else {
9034             uvm_processor_mask_set(&block->mapped, id);
9035         }
9036 
9037         if (uvm_page_mask_region_empty(&gpu_state->resident, block_region))
9038             block_clear_resident_processor(block, id);
9039         else
9040             block_set_resident_processor(block, id);
9041 
9042         if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region))
9043             uvm_processor_mask_clear(&block->evicted_gpus, id);
9044         else
9045             uvm_processor_mask_set(&block->evicted_gpus, id);
9046     }
9047 }
9048 
9049 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts
9050 // corresponding to a block split.
9051 static void block_split_page_mask(uvm_page_mask_t *existing_mask,
9052                                   size_t existing_pages,
9053                                   uvm_page_mask_t *new_mask,
9054                                   size_t new_pages)
9055 {
9056     UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n",
9057                    existing_pages, new_pages);
9058 
9059     // The new block is always in the upper region of existing, so shift the bit
9060     // vectors down.
9061     //
9062     // Note that bitmap_shift_right requires both dst and src to be the same
9063     // size. That's ok since we don't scale them by block size.
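    //
    // For example, splitting a fully-populated 512-page mask into 384 existing
    // pages and 128 new pages shifts bits 384..511 down to bits 0..127 of
    // new_mask and then clears bits 384..511 of existing_mask.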
9064     uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages);
9065     uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages));
9066 }
9067 
9068 // Split the CPU state within the existing block. existing's start is correct
9069 // but its end has not yet been adjusted.
9070 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
9071 {
9072     size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new);
9073     uvm_pte_bits_cpu_t pte_bit;
9074     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing);
9075     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9076     uvm_page_index_t page_index;
9077     uvm_page_index_t next_page_index;
9078     uvm_cpu_chunk_t *chunk;
9079     uvm_va_range_t *existing_va_range = existing->va_range;
9080 
9081     if (existing_va_range) {
9082         UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9083         UVM_ASSERT(existing->va_range->type == new->va_range->type);
9084     }
9085 
9086     UVM_ASSERT(existing->start < new->start);
9087     UVM_ASSERT(existing->end == new->end);
9088 
9089     UVM_ASSERT(PAGE_ALIGNED(new->start));
9090     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9091 
9092     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9093 
9094     // We don't have to unmap the CPU since its virtual -> physical mappings
9095     // don't change.
9096 
9097     page_index = uvm_va_block_next_page_in_mask(block_region, &existing->cpu.allocated, split_page_index - 1);
9098 
9099     for_each_cpu_chunk_in_block_region_safe(chunk,
9100                                             page_index,
9101                                             next_page_index,
9102                                             existing,
9103                                             uvm_va_block_region(split_page_index, block_region.outer)) {
9104         uvm_page_index_t new_chunk_page_index;
9105         NV_STATUS status;
9106 
9107         uvm_cpu_chunk_remove_from_block(existing, page_index);
9108 
9109         // The chunk has to be adjusted for the new block before inserting it.
9110         new_chunk_page_index = page_index - split_page_index;
9111 
9112         // This should never fail because all necessary storage was allocated
9113         // in block_presplit_cpu_chunks().
9114         status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
9115         UVM_ASSERT(status == NV_OK);
9116     }
9117 
9118     new->cpu.ever_mapped = existing->cpu.ever_mapped;
9119 
9120     block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
9121 
9122     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
9123         block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
9124 }
9125 
9126 // Fill out the blocks' chunks arrays with the chunks split by
9127 // block_presplit_gpu_chunks.
9128 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9129 {
9130     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9131     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id);
9132     uvm_gpu_chunk_t **temp_chunks;
9133     uvm_gpu_chunk_t *original_chunk;
9134     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
9135     size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new;
9136     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9137     size_t i;
9138 
9139     block_gpu_chunk_get_split_state(existing,
9140                                     &existing_before_state,
9141                                     existing->start,
9142                                     existing->end,
9143                                     split_page_index,
9144                                     gpu);
9145     block_gpu_chunk_get_split_state(existing,
9146                                     &existing_after_state,
9147                                     existing->start,
9148                                     new->start - 1,
9149                                     split_page_index - 1,
9150                                     gpu);
9151     block_gpu_chunk_get_split_state(new,
9152                                     &new_state,
9153                                     new->start,
9154                                     new->end,
9155                                     0,
9156                                     gpu);
9157 
9158     // General case (B is original_chunk):
9159     //                                          split
9160     //                                            v
9161     //  existing (before) [------ A -----][------ B -----][------ C -----]
9162     //  existing (after)  [------ A -----][- B0 -]
9163     //  new                                       [- B1 -][------ C -----]
9164     //
9165     // Note that the logic below also handles the case of the split happening at
9166     // a chunk boundary. That case behaves as though there is no B0 chunk.
9167 
9168     // Number of chunks to the left and right of original_chunk (A and C above).
9169     // Either or both of these may be 0.
9170     num_pre_chunks  = existing_before_state.chunk_index;
9171     num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1;
9172 
9173     // Number of subchunks under existing's portion of original_chunk (B0 above)
9174     num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks;
9175 
9176     // Number of subchunks under new's portion of original_chunk (B1 above)
9177     num_split_chunks_new = new_state.num_chunks - num_post_chunks;
9178 
9179     UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0);
9180     UVM_ASSERT(num_split_chunks_new > 0);
9181 
9182     // Copy post chunks from the end of existing into new (C above)
9183     memcpy(&new_gpu_state->chunks[num_split_chunks_new],
9184            &existing_gpu_state->chunks[existing_before_state.chunk_index + 1],
9185            num_post_chunks * sizeof(new_gpu_state->chunks[0]));
9186 
9187     // Save off the original split chunk since we may overwrite the array
9188     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
9189 
9190     // Fill out the new pointers
9191     if (original_chunk) {
9192         // Note that if the split happened at a chunk boundary, original_chunk
9193         // will not be split. In that case, num_split_chunks_existing will be 0
9194         // and num_split_chunks_new will be 1, so the left copy will be skipped
9195         // and the right copy will pick up the chunk.
9196 
9197         // Copy left newly-split chunks into existing (B0 above). The array was
9198         // re-sized in block_presplit_gpu_chunks as necessary.
9199         size_t num_subchunks;
9200 
9201         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9202                                                   original_chunk,
9203                                                   0, // start_index
9204                                                   num_split_chunks_existing,
9205                                                   &existing_gpu_state->chunks[existing_before_state.chunk_index]);
9206         UVM_ASSERT(num_subchunks == num_split_chunks_existing);
9207 
9208         // Copy right newly-split chunks into new (B1 above), overwriting the
9209         // pointer to the original chunk.
9210         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
9211                                                   original_chunk,
9212                                                   num_split_chunks_existing, // start_index
9213                                                   num_split_chunks_new,
9214                                                   &new_gpu_state->chunks[0]);
9215         UVM_ASSERT(num_subchunks == num_split_chunks_new);
9216     }
9217     else {
9218         // If the chunk wasn't already populated we don't need to copy pointers
9219         // anywhere, but we need to clear out stale pointers from existing's
9220         // array covering the new elements. new's chunks array was already zero-
9221         // initialized.
9222         memset(&existing_gpu_state->chunks[existing_before_state.chunk_index],
9223                0,
9224                num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0]));
9225     }
9226 
9227     // Since we update the reverse map information, protect it against a
9228     // concurrent lookup
9229     uvm_spin_lock(&gpu->pmm.list_lock);
9230 
9231     // Update the reverse map of all the chunks that are now under the new block
9232     for (i = 0; i < new_state.num_chunks; ++i) {
9233         if (new_gpu_state->chunks[i]) {
9234             UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing);
9235             new_gpu_state->chunks[i]->va_block = new;
9236 
9237             // Adjust the page_index within the VA block for the new subchunks in
9238             // the new VA block
9239             UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index);
9240             new_gpu_state->chunks[i]->va_block_page_index -= split_page_index;
9241         }
9242     }
9243 
9244     uvm_spin_unlock(&gpu->pmm.list_lock);
9245 
9246     // Attempt to shrink existing's chunk allocation. If the realloc fails, just
9247     // keep on using the old larger one.
9248     if (existing_after_state.num_chunks < existing_before_state.num_chunks) {
9249         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
9250                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
9251         if (temp_chunks)
9252             existing_gpu_state->chunks = temp_chunks;
9253     }
9254 }
9255 
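// Split the per-GPU state (chunks, residency and PTE masks, and page table
// range references) between existing and new for the given GPU. As in
// block_split_cpu(), existing's end has not yet been adjusted when this is
// called.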
9256 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id)
9257 {
9258     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id);
9259     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id);
9260     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9261     uvm_gpu_va_space_t *gpu_va_space;
9262     uvm_gpu_t *gpu;
9263     uvm_gpu_t *accessing_gpu;
9264     size_t new_pages = uvm_va_block_num_cpu_pages(new);
9265     size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
9266     uvm_pte_bits_gpu_t pte_bit;
9267     size_t num_chunks, i;
9268     uvm_cpu_chunk_t *cpu_chunk;
9269     uvm_page_index_t page_index;
9270 
9271     if (!existing_gpu_state)
9272         return;
9273 
9274     gpu = uvm_va_space_get_gpu(va_space, gpu_id);
9275     UVM_ASSERT(new_gpu_state);
9276 
9277     new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes;
9278 
9279     UVM_ASSERT(PAGE_ALIGNED(new->start));
9280     UVM_ASSERT(PAGE_ALIGNED(existing->start));
9281     existing_pages = (new->start - existing->start) / PAGE_SIZE;
9282 
9283     for_each_cpu_chunk_in_block(cpu_chunk, page_index, new) {
9284         uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
9285                                                      uvm_cpu_chunk_get_gpu_phys_addr(cpu_chunk, gpu->parent),
9286                                                      new);
9287     }
9288 
9289     block_copy_split_gpu_chunks(existing, new, gpu);
9290 
9291     num_chunks = block_num_gpu_chunks(new, gpu);
9292 
9293     // Reparent GPU mappings for indirect peers
9294     for (i = 0; i < num_chunks; ++i) {
9295         uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
9296         if (!chunk)
9297             continue;
9298 
9299         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9300             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
9301 
9302             uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
9303                                                                peer_addr,
9304                                                                new);
9305         }
9306     }
9307 
9308     block_split_page_mask(&existing_gpu_state->resident,
9309                           existing_pages,
9310                           &new_gpu_state->resident,
9311                           new_pages);
9312 
9313     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
9314         block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages,
9315                               &new_gpu_state->pte_bits[pte_bit], new_pages);
9316     }
9317 
9318     // Adjust page table ranges.
9319     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
9320     if (gpu_va_space) {
9321         if (existing_gpu_state->page_table_range_big.table) {
9322             NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
9323 
9324             // existing's end has not been adjusted yet
9325             existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
9326 
9327             // Take references on all big pages covered by new
9328             new_pages_big = uvm_va_block_num_big_pages(new, big_page_size);
9329             if (new_pages_big) {
9330                 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9331                                                &existing_gpu_state->page_table_range_big,
9332                                                &new_gpu_state->page_table_range_big,
9333                                                new_pages_big);
9334 
9335                 // If the split point is within a big page region, we might have
9336                 // a gap since neither existing nor new can use it anymore.
9337                 // Get the top N bits from existing's mask to handle that.
9338                 bitmap_shift_right(new_gpu_state->big_ptes,
9339                                    existing_gpu_state->big_ptes,
9340                                    uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big,
9341                                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9342 
9343                 new_gpu_state->initialized_big = existing_gpu_state->initialized_big;
9344             }
9345 
9346             // Drop existing's references on the big PTEs it no longer covers
9347             // now that new has references on them. Note that neither existing
9348             // nor new might have big PTEs after the split. In that case, this
9349             // shrink will free the entire old range.
9350             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9351                                         &existing_gpu_state->page_table_range_big,
9352                                         existing_pages_big);
9353 
9354             if (existing_pages_big == 0) {
9355                 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big));
9356                 existing_gpu_state->initialized_big = false;
9357             }
9358 
9359             bitmap_clear(existing_gpu_state->big_ptes,
9360                          existing_pages_big,
9361                          MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big);
9362         }
9363 
9364         if (existing_gpu_state->page_table_range_4k.table) {
9365             // Since existing and new share the same PDE we just need to bump
9366             // the ref-count on new's sub-range.
9367             uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
9368                                            &existing_gpu_state->page_table_range_4k,
9369                                            &new_gpu_state->page_table_range_4k,
9370                                            uvm_va_block_size(new) / UVM_PAGE_SIZE_4K);
9371 
9372             // Drop existing's references on the PTEs it no longer covers now
9373             // that new has references on them.
9374             existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K);
9375             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
9376                                         &existing_gpu_state->page_table_range_4k,
9377                                         existing_pages_4k);
9378         }
9379 
9380         // We have to set this explicitly to handle the case of splitting an
9381         // invalid, active 2M PTE with no lower page tables allocated.
9382         if (existing_gpu_state->pte_is_2m) {
9383             UVM_ASSERT(!existing_gpu_state->page_table_range_big.table);
9384             UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table);
9385             existing_gpu_state->pte_is_2m = false;
9386         }
9387 
9388         // existing can't possibly cover 2MB after a split, so drop any 2M PTE
9389         // references it has. We've taken the necessary references on the lower
9390         // tables above.
9391         block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m);
9392         existing_gpu_state->activated_big = false;
9393         existing_gpu_state->activated_4k = false;
9394     }
9395 
9396     block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages);
9397 }
9398 
9399 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
9400                              NvU64 new_end,
9401                              uvm_va_block_t **new_va_block,
9402                              uvm_va_range_t *new_va_range)
9403 {
9404     uvm_va_space_t *va_space;
9405     uvm_va_block_t *new_block = NULL;
9406     NV_STATUS status;
9407 
9408     va_space = new_va_range->va_space;
9409     UVM_ASSERT(existing_va_block->va_range);
9410     UVM_ASSERT(existing_va_block->va_range->va_space == va_space);
9411     UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block));
9412 
9413     // External range types can't be split
9414     UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9415     UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
9416     uvm_assert_rwsem_locked_write(&va_space->lock);
9417 
9418     UVM_ASSERT(new_end > existing_va_block->start);
9419     UVM_ASSERT(new_end < existing_va_block->end);
9420     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
9421 
9422     status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block);
9423     if (status != NV_OK)
9424         return status;
9425 
9426     // We're protected from other splits and faults by the va_space lock being
9427     // held in write mode, but that doesn't stop the reverse mapping (eviction
9428     // path) from inspecting the existing block. Stop those threads by taking
9429     // the block lock. When a reverse mapping thread takes this lock after the
9430     // split has been performed, it will have to re-inspect state and may see
9431     // that it should use the newly-split block instead.
9432     uvm_mutex_lock(&existing_va_block->lock);
9433 
9434     status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range);
9435 
9436     uvm_mutex_unlock(&existing_va_block->lock);
9437 
9438     if (status != NV_OK)
9439         uvm_va_block_release(new_block);
9440     else if (new_va_block)
9441         *new_va_block = new_block;
9442 
9443     return status;
9444 }
9445 
9446 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
9447                                     NvU64 new_end,
9448                                     uvm_va_block_t *new_block,
9449                                     uvm_va_range_t *new_va_range)
9450 {
9451     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block);
9452     uvm_gpu_id_t id;
9453     NV_STATUS status;
9454     uvm_perf_event_data_t event_data;
9455 
9456     UVM_ASSERT(block_check_chunks(existing_va_block));
9457 
9458     // As soon as we update existing's reverse mappings to point to the newly-
9459     // split block, the eviction path could try to operate on the new block.
9460     // Lock that out too until new is ready.
9461     //
9462     // Note that we usually shouldn't nest block locks, but it's ok here because
9463     // we just created new_block so no other thread could possibly take it out
9464     // of order with existing's lock.
9465     uvm_mutex_lock_no_tracking(&new_block->lock);
9466 
9467     // The split has to be transactional, meaning that if we fail, the existing
9468     // block must not be modified. Handle that by pre-allocating everything we
9469     // might need under both existing and new at the start so we only have a
9470     // single point of failure.
9471 
9472     // Since pre-allocation might require allocating new PTEs, we have to handle
9473     // allocation retry which might drop existing's block lock. The
9474     // preallocation is split into two steps for that: the first part which
9475     // allocates and splits PTEs can handle having the block lock dropped then
9476     // re-taken. It won't modify existing_va_block other than adding new PTE
9477     // allocations and splitting existing PTEs, which is always safe.
9478     status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block,
9479                                        NULL,
9480                                        block_split_presplit_ptes(existing_va_block, new_block));
9481     if (status != NV_OK)
9482         goto out;
9483 
9484     // Pre-allocate, stage two. This modifies existing_va_block in ways which
9485     // violate many assumptions (such as changing chunk size), but it will put
9486     // things back into place on a failure without dropping the block lock.
9487     status = block_split_preallocate_no_retry(existing_va_block, new_block);
9488     if (status != NV_OK)
9489         goto out;
9490 
9491     // We'll potentially be freeing page tables, so we need to wait for any
9492     // outstanding work before we start
9493     status = uvm_tracker_wait(&existing_va_block->tracker);
9494     if (status != NV_OK)
9495         goto out;
9496 
9497     // Update existing's state only once we're past all failure points
9498 
9499     event_data.block_shrink.block = existing_va_block;
9500     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data);
9501 
9502     block_split_cpu(existing_va_block, new_block);
9503 
9504     for_each_gpu_id(id)
9505         block_split_gpu(existing_va_block, new_block, id);
9506 
9507     // Update the size of the existing block first so that
9508     // block_set_processor_masks can use block_{set,clear}_resident_processor
9509     // that relies on the size to be correct.
9510     existing_va_block->end = new_end;
9511 
9512     block_split_page_mask(&existing_va_block->read_duplicated_pages,
9513                           uvm_va_block_num_cpu_pages(existing_va_block),
9514                           &new_block->read_duplicated_pages,
9515                           uvm_va_block_num_cpu_pages(new_block));
9516 
9517     block_split_page_mask(&existing_va_block->maybe_mapped_pages,
9518                           uvm_va_block_num_cpu_pages(existing_va_block),
9519                           &new_block->maybe_mapped_pages,
9520                           uvm_va_block_num_cpu_pages(new_block));
9521 
9522     block_set_processor_masks(existing_va_block);
9523     block_set_processor_masks(new_block);
9524 
9525     if (uvm_va_block_is_hmm(existing_va_block)) {
9526         uvm_hmm_va_block_split_tree(existing_va_block, new_block);
9527         uvm_va_policy_node_split_move(existing_va_block, new_block);
9528     }
9529 
9530 out:
9531     // Run checks on existing_va_block even on failure, since an error must
9532     // leave the block in a consistent state.
9533     UVM_ASSERT(block_check_chunks(existing_va_block));
9534     UVM_ASSERT(block_check_mappings(existing_va_block));
9535     if (status == NV_OK) {
9536         UVM_ASSERT(block_check_chunks(new_block));
9537         UVM_ASSERT(block_check_mappings(new_block));
9538     }
9539     else {
9540         block_free_cpu_chunk_storage(new_block);
9541     }
9542 
9543     uvm_mutex_unlock_no_tracking(&new_block->lock);
9544 
9545     return status;
9546 }
9547 
9548 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block,
9549                                               uvm_va_block_region_t region)
9550 {
9551     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9552     uvm_va_range_t *va_range = va_block->va_range;
9553 
9554     if (!uvm_va_space_can_read_duplicate(va_space, NULL))
9555         return false;
9556 
9557     // TODO: Bug 3660922: need to implement HMM read duplication support.
9558     if (uvm_va_block_is_hmm(va_block) ||
9559         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
9560         return false;
9561 
    if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET &&
        uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0)
9564         return false;
9565 
9566     return true;
9567 }
9568 
9569 // Returns the new access permission for the processor that faulted or
9570 // triggered access counter notifications on the given page
9571 //
9572 // TODO: Bug 1766424: this function works on a single page at a time. This
9573 //       could be changed in the future to optimize multiple faults/counters on
9574 //       contiguous pages.
9575 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block,
9576                                          uvm_va_block_context_t *va_block_context,
9577                                          uvm_page_index_t page_index,
9578                                          uvm_processor_id_t fault_processor_id,
9579                                          uvm_processor_id_t new_residency,
9580                                          uvm_fault_access_type_t access_type)
9581 {
9582     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9583     uvm_prot_t logical_prot, new_prot;
9584 
9585     // TODO: Bug 1766432: Refactor into policies. Current policy is
9586     //       query_promote: upgrade access privileges to avoid future faults IF
9587     //       they don't trigger further revocations.
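    // For example, a read fault (new_prot == UVM_PROT_READ_ONLY) on a page
    // whose logical protection allows writes may be promoted to
    // UVM_PROT_READ_WRITE below, as long as no other faultable processor would
    // need to have an atomic mapping revoked as a result.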
9588     new_prot = uvm_fault_access_type_to_prot(access_type);
9589     logical_prot = compute_logical_prot(va_block, va_block_context, page_index);
9590 
9591     UVM_ASSERT(logical_prot >= new_prot);
9592 
9593     if (logical_prot > UVM_PROT_READ_ONLY && new_prot == UVM_PROT_READ_ONLY &&
9594         !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) {
9595         uvm_processor_mask_t processors_with_atomic_mapping;
9596         uvm_processor_mask_t revoke_processors;
9597 
9598         block_page_authorized_processors(va_block,
9599                                          page_index,
9600                                          UVM_PROT_READ_WRITE_ATOMIC,
9601                                          &processors_with_atomic_mapping);
9602 
9603         uvm_processor_mask_andnot(&revoke_processors,
9604                                   &processors_with_atomic_mapping,
9605                                   &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9606 
        // Restrict the check to faultable processors: the upgrade is only
        // allowed if no faultable processor would need to have its atomic
        // mapping revoked.
9609         uvm_processor_mask_and(&revoke_processors, &revoke_processors, &va_space->faultable_processors);
9610 
9611         if (uvm_processor_mask_empty(&revoke_processors))
9612             new_prot = UVM_PROT_READ_WRITE;
    }

    if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) {
9615         if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id))
9616             new_prot = UVM_PROT_READ_WRITE_ATOMIC;
9617     }
9618 
9619     return new_prot;
9620 }
9621 
9622 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9623                                                        uvm_va_block_context_t *va_block_context,
9624                                                        uvm_processor_id_t new_residency,
9625                                                        uvm_processor_id_t processor_id,
9626                                                        const uvm_processor_mask_t *map_processors,
9627                                                        uvm_va_block_region_t region,
9628                                                        const uvm_page_mask_t *map_page_mask,
9629                                                        uvm_prot_t max_prot,
9630                                                        const uvm_processor_mask_t *thrashing_processors,
9631                                                        uvm_tracker_t *tracker)
9632 {
9633     NV_STATUS status;
9634     uvm_processor_id_t map_processor_id;
9635     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9636     uvm_prot_t new_map_prot = max_prot;
9637     uvm_processor_mask_t map_processors_local;
9638 
9639     uvm_processor_mask_copy(&map_processors_local, map_processors);
9640 
9641     // Handle atomic mappings separately
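    // If the requesting processor has native atomics to the new residency, the
    // processors that also have native atomics are mapped with
    // UVM_PROT_READ_WRITE_ATOMIC here and the rest fall through to the generic
    // loop below with a downgraded protection. Otherwise, all mappings are
    // downgraded up front.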
9642     if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) {
9643         bool this_processor_has_native_atomics;
9644 
9645         this_processor_has_native_atomics =
9646             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id);
9647 
9648         if (this_processor_has_native_atomics) {
9649             uvm_processor_mask_t map_atomic_processors;
9650 
9651             // Compute processors with native atomics to the residency
9652             uvm_processor_mask_and(&map_atomic_processors,
9653                                    &map_processors_local,
9654                                    &va_space->has_native_atomics[uvm_id_value(new_residency)]);
9655 
9656             // Filter out these mapped processors for the next steps
9657             uvm_processor_mask_andnot(&map_processors_local, &map_processors_local, &map_atomic_processors);
9658 
9659             for_each_id_in_mask(map_processor_id, &map_atomic_processors) {
9660                 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9661                 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9662                     cause = UvmEventMapRemoteCauseThrashing;
9663 
9664                 status = uvm_va_block_map(va_block,
9665                                           va_block_context,
9666                                           map_processor_id,
9667                                           region,
9668                                           map_page_mask,
9669                                           UVM_PROT_READ_WRITE_ATOMIC,
9670                                           cause,
9671                                           tracker);
9672                 if (status != NV_OK)
9673                     return status;
9674             }
9675 
9676             new_map_prot = UVM_PROT_READ_WRITE;
9677         }
9678         else {
9679             if (UVM_ID_IS_CPU(processor_id))
9680                 new_map_prot = UVM_PROT_READ_WRITE;
9681             else
9682                 new_map_prot = UVM_PROT_READ_ONLY;
9683         }
9684     }
9685 
9686     // Map the rest of processors
9687     for_each_id_in_mask(map_processor_id, &map_processors_local) {
9688         UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
9689         uvm_prot_t final_map_prot;
9690         bool map_processor_has_enabled_system_wide_atomics =
9691             uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id);
9692 
        // Write mappings from processors with system-wide atomics disabled are
        // treated as atomic mappings
9694         if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics)
9695             final_map_prot = UVM_PROT_READ_WRITE_ATOMIC;
9696         else
9697             final_map_prot = new_map_prot;
9698 
9699         if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
9700             cause = UvmEventMapRemoteCauseThrashing;
9701 
9702         status = uvm_va_block_map(va_block,
9703                                   va_block_context,
9704                                   map_processor_id,
9705                                   region,
9706                                   map_page_mask,
9707                                   final_map_prot,
9708                                   cause,
9709                                   tracker);
9710         if (status != NV_OK)
9711             return status;
9712     }
9713 
9714     return NV_OK;
9715 }
9716 
9717 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
9718                                                     uvm_va_block_context_t *va_block_context,
9719                                                     uvm_processor_id_t new_residency,
9720                                                     uvm_processor_id_t processor_id,
9721                                                     uvm_va_block_region_t region,
9722                                                     const uvm_page_mask_t *map_page_mask,
9723                                                     uvm_prot_t max_prot,
9724                                                     const uvm_processor_mask_t *thrashing_processors)
9725 {
9726     NV_STATUS tracker_status, status = NV_OK;
9727     uvm_processor_mask_t map_other_processors, map_uvm_lite_gpus;
9728     uvm_processor_id_t map_processor_id;
9729     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9730     const uvm_page_mask_t *final_page_mask = map_page_mask;
9731     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9732     const uvm_va_policy_t *policy = va_block_context->policy;
9733     uvm_processor_id_t preferred_location;
9734 
9735     uvm_assert_mutex_locked(&va_block->lock);
9736     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, policy, region));
9737 
9738     // Read duplication takes precedence over SetAccessedBy.
9739     //
9740     // Exclude ranges with read duplication set...
9741     if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
9742         status = NV_OK;
9743         goto out;
9744     }
9745 
9746     // ... and pages read-duplicated by performance heuristics
9747     if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) {
9748         if (map_page_mask) {
9749             uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask,
9750                                  map_page_mask,
9751                                  &va_block->read_duplicated_pages);
9752         }
9753         else {
9754             uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages);
9755         }
9756         final_page_mask = &va_block_context->mapping.filtered_page_mask;
9757     }
9758 
9759     // Add mappings for accessed_by processors and the given processor mask
9760     if (thrashing_processors)
9761         uvm_processor_mask_or(&map_other_processors, &policy->accessed_by, thrashing_processors);
9762     else
9763         uvm_processor_mask_copy(&map_other_processors, &policy->accessed_by);
9764 
9765     // Only processors that can access the new location must be considered
9766     uvm_processor_mask_and(&map_other_processors,
9767                            &map_other_processors,
9768                            &va_space->accessible_from[uvm_id_value(new_residency)]);
9769 
9770     // Exclude caller processor as it must have already been mapped
9771     uvm_processor_mask_clear(&map_other_processors, processor_id);
9772 
9773     // Exclude preferred location so it won't get remote mappings
9774     preferred_location = policy->preferred_location;
9775     if (UVM_ID_IS_VALID(preferred_location) &&
9776         !uvm_id_equal(new_residency, preferred_location) &&
9777         uvm_va_space_processor_has_memory(va_space, preferred_location)) {
9778         uvm_processor_mask_clear(&map_other_processors, preferred_location);
9779     }
9780 
9781     // Map the UVM-Lite GPUs if the new location is the preferred location. This
9782     // will only create mappings on first touch. After that they're persistent
9783     // so uvm_va_block_map will be a no-op.
9784     uvm_processor_mask_and(&map_uvm_lite_gpus, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9785     if (!uvm_processor_mask_empty(&map_uvm_lite_gpus) &&
9786         uvm_id_equal(new_residency, preferred_location)) {
9787         for_each_id_in_mask(map_processor_id, &map_uvm_lite_gpus) {
9788             status = uvm_va_block_map(va_block,
9789                                       va_block_context,
9790                                       map_processor_id,
9791                                       region,
9792                                       final_page_mask,
9793                                       UVM_PROT_READ_WRITE_ATOMIC,
9794                                       UvmEventMapRemoteCauseCoherence,
9795                                       &local_tracker);
9796             if (status != NV_OK)
9797                 goto out;
9798         }
9799     }
9800 
9801     uvm_processor_mask_andnot(&map_other_processors, &map_other_processors, block_get_uvm_lite_gpus(va_block));
9802 
9803     // We can't map non-migratable pages to the CPU. If we have any, build a
9804     // new mask of migratable pages and map the CPU separately.
9805     if (uvm_processor_mask_test(&map_other_processors, UVM_ID_CPU) &&
9806         !uvm_range_group_all_migratable(va_space,
9807                                         uvm_va_block_region_start(va_block, region),
9808                                         uvm_va_block_region_end(va_block, region))) {
9809         uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask;
9810 
9811         uvm_range_group_migratable_page_mask(va_block, region, migratable_mask);
9812         if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) {
9813             uvm_processor_mask_t cpu_mask;
9814             uvm_processor_mask_zero(&cpu_mask);
9815             uvm_processor_mask_set(&cpu_mask, UVM_ID_CPU);
9816 
9817             status = do_block_add_mappings_after_migration(va_block,
9818                                                            va_block_context,
9819                                                            new_residency,
9820                                                            processor_id,
9821                                                            &cpu_mask,
9822                                                            region,
9823                                                            migratable_mask,
9824                                                            max_prot,
9825                                                            thrashing_processors,
9826                                                            &local_tracker);
9827             if (status != NV_OK)
9828                 goto out;
9829         }
9830 
9831         uvm_processor_mask_clear(&map_other_processors, UVM_ID_CPU);
9832     }
9833 
9834     status = do_block_add_mappings_after_migration(va_block,
9835                                                    va_block_context,
9836                                                    new_residency,
9837                                                    processor_id,
9838                                                    &map_other_processors,
9839                                                    region,
9840                                                    final_page_mask,
9841                                                    max_prot,
9842                                                    thrashing_processors,
9843                                                    &local_tracker);
9846 
9847 out:
9848     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9849     uvm_tracker_deinit(&local_tracker);
9850     return status == NV_OK ? tracker_status : status;
9851 }
9852 
9853 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
9854                                                         uvm_processor_id_t processor_id,
9855                                                         uvm_page_index_t page_index)
9856 {
9857     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9858     uvm_processor_mask_t resident_processors;
9859     NvU32 resident_processors_count;
9860 
9861     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id))
9862         return UVM_PROT_READ_WRITE_ATOMIC;
9863 
9864     uvm_va_block_page_resident_processors(va_block, page_index, &resident_processors);
9865     resident_processors_count = uvm_processor_mask_get_count(&resident_processors);
9866 
9867     if (resident_processors_count == 0) {
9868         return UVM_PROT_NONE;
9869     }
9870     else if (resident_processors_count > 1) {
        // If there are multiple resident copies, we can only map READ ONLY
9872         //
9873         // The block state doesn't track the mapping target (aperture) of each
9874         // individual PTE, just the permissions and where the data is resident.
9875         // If the data is resident in multiple places, then we have a problem
9876         // since we can't know where the PTE points. This means we won't know
9877         // what needs to be unmapped for cases like UvmUnregisterGpu and
9878         // UvmDisablePeerAccess.
9879         //
9880         // The simple way to solve this is to enforce that a read-duplication
9881         // mapping always points to local memory.
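        // Hence, only processors holding one of the resident copies get a
        // (read-only) mapping; any other processor gets no mapping at all.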
9882         if (uvm_processor_mask_test(&resident_processors, processor_id))
9883             return UVM_PROT_READ_ONLY;
9884 
9885         return UVM_PROT_NONE;
9886     }
9887     else {
9888         uvm_processor_id_t atomic_id;
9889         uvm_processor_id_t residency;
9890         uvm_processor_mask_t atomic_mappings;
9891         uvm_processor_mask_t write_mappings;
9892 
        // Find the id of the processor holding the only resident copy
9894         residency = uvm_processor_mask_find_first_id(&resident_processors);
9895         UVM_ASSERT(UVM_ID_IS_VALID(residency));
9896 
        // If the requesting processor cannot access memory on the processor
        // holding the resident copy, exit
9898         if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id))
9899             return UVM_PROT_NONE;
9900 
9901         // Fast path: if the page is not mapped anywhere else, it can be safely
9902         // mapped with RWA permission
9903         if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index))
9904             return UVM_PROT_READ_WRITE_ATOMIC;
9905 
9906         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, &atomic_mappings);
9907 
9908         // Exclude processors with system-wide atomics disabled from atomic_mappings
9909         uvm_processor_mask_and(&atomic_mappings,
9910                                &atomic_mappings,
9911                                &va_space->system_wide_atomics_enabled_processors);
9912 
9913         // Exclude the processor for which the mapping protections are being computed
9914         uvm_processor_mask_clear(&atomic_mappings, processor_id);
9915 
9916         // If there is any processor with atomic mapping, check if it has native atomics to the processor
9917         // with the resident copy. If it does not, we can only map READ ONLY
9918         atomic_id = uvm_processor_mask_find_first_id(&atomic_mappings);
9919         if (UVM_ID_IS_VALID(atomic_id) &&
9920             !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) {
9921             return UVM_PROT_READ_ONLY;
9922         }
9923 
9924         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, &write_mappings);
9925 
9926         // Exclude the processor for which the mapping protections are being computed
9927         uvm_processor_mask_clear(&write_mappings, processor_id);
9928 
9929         // At this point, any processor with atomic mappings either has native
9930         // atomics support to the processor with the resident copy or has
9931         // disabled system-wide atomics. If the requesting processor has
9932         // disabled system-wide atomics or has native atomics to that processor,
9933         // we can map with ATOMIC privileges. Likewise, if there are no other
9934         // processors with WRITE or ATOMIC mappings, we can map with ATOMIC
9935         // privileges. For HMM, don't allow GPU atomic access to remote mapped
9936         // system memory even if there are no write mappings since CPU access
9937         // can be upgraded without notification.
9938         if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) ||
9939             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) ||
9940             (uvm_processor_mask_empty(&write_mappings) && !uvm_va_block_is_hmm(va_block))) {
9941             return UVM_PROT_READ_WRITE_ATOMIC;
9942         }
9943 
9944         return UVM_PROT_READ_WRITE;
9945     }
9946 }
9947 
9948 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
9949                                     uvm_va_block_context_t *va_block_context,
9950                                     uvm_processor_id_t processor_id,
9951                                     uvm_va_block_region_t region,
9952                                     const uvm_page_mask_t *page_mask,
9953                                     UvmEventMapRemoteCause cause)
9954 {
9955     uvm_va_range_t *va_range = va_block->va_range;
9956     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9957     NV_STATUS status = NV_OK;
9958     uvm_page_index_t page_index;
9959     uvm_range_group_range_iter_t iter;
9960     uvm_prot_t prot_to_map;
9961 
9962     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block, va_block_context->policy, region));
9963 
9964     if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9965         if (!uvm_va_range_vma_check(va_range, va_block_context->mm))
9966             return NV_OK;
9967 
9968         uvm_range_group_range_migratability_iter_first(va_space,
9969                                                        uvm_va_block_region_start(va_block, region),
9970                                                        uvm_va_block_region_end(va_block, region),
9971                                                        &iter);
9972     }
9973 
9974     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map)
9975         va_block_context->mask_by_prot[prot_to_map - 1].count = 0;
9976 
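    // Bucket each mappable page by the highest protection it can be mapped
    // with; one uvm_va_block_map() call per protection level is then issued in
    // the final loop below.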
9977     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
9978         // Read duplication takes precedence over SetAccessedBy. Exclude pages
9979         // read-duplicated by performance heuristics
9980         if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))
9981             continue;
9982 
9983         prot_to_map = uvm_va_block_page_compute_highest_permission(va_block, processor_id, page_index);
9984         if (prot_to_map == UVM_PROT_NONE)
9985             continue;
9986 
9987         if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
9988             while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) {
9989                 uvm_range_group_range_migratability_iter_next(va_space,
9990                                                               &iter,
9991                                                               uvm_va_block_region_end(va_block, region));
9992             }
9993 
9994             if (!iter.migratable)
9995                 continue;
9996         }
9997 
9998         if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0)
9999             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask);
10000 
10001         uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index);
10002     }
10003 
10004     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) {
10005         if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0)
10006             continue;
10007 
10008         status = uvm_va_block_map(va_block,
10009                                   va_block_context,
10010                                   processor_id,
10011                                   region,
10012                                   &va_block_context->mask_by_prot[prot_to_map - 1].page_mask,
10013                                   prot_to_map,
10014                                   cause,
10015                                   &va_block->tracker);
10016         if (status != NV_OK)
10017             break;
10018     }
10019 
10020     return status;
10021 }
10022 
10023 static bool can_read_duplicate(uvm_va_block_t *va_block,
10024                                uvm_page_index_t page_index,
10025                                const uvm_va_policy_t *policy,
10026                                const uvm_perf_thrashing_hint_t *thrashing_hint)
10027 {
10028     if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block)))
10029         return true;
10030 
10031     if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
10032         uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) &&
10033         thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN)
10034         return true;
10035 
10036     return false;
10037 }
10038 
10039 // TODO: Bug 1827400: If the faulting processor has support for native
10040 //       atomics to the current location and the faults on the page were
10041 //       triggered by atomic accesses only, we keep the current residency.
10042 //       This is a short-term solution to exercise remote atomics over
10043 //       NVLINK when possible (not only when preferred location is set to
10044 //       the remote GPU) as they are much faster than relying on page
10045 //       faults and permission downgrades, which cause thrashing. In the
10046 //       future, the thrashing detection/prevention heuristics should
10047 //       detect and handle this case.
10048 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space,
10049                                        NvU32 access_type_mask,
10050                                        uvm_processor_id_t processor_id,
10051                                        uvm_processor_id_t residency)
10052 {
10053     // This policy can be enabled/disabled using a module parameter
10054     if (!uvm_perf_map_remote_on_native_atomics_fault)
10055         return false;
10056 
10057     // Only consider atomics faults
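    // (i.e. bail out unless every access type in the mask is at least as
    // strong as UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)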
10058     if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)
10059         return false;
10060 
    // We cannot differentiate CPU writes from atomics. We exclude CPU faults
    // from the logic above in order to avoid mapping the CPU to vidmem due to
    // a write.
10064     if (UVM_ID_IS_CPU(processor_id))
10065         return false;
10066 
    // On P9 systems (which have native HW support for system-wide atomics), we
    // have determined experimentally that placing memory on a GPU yields the
    // best performance in most cases (since the CPU can cache vidmem but not
    // vice versa). Therefore, don't map remotely if the current residency is
    // sysmem.
10072     if (UVM_ID_IS_CPU(residency))
10073         return false;
10074 
10075     return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id);
10076 }
10077 
10078 // TODO: Bug 1766424: this function works on a single page at a time. This
10079 //       could be changed in the future to optimize multiple faults or access
10080 //       counter notifications on contiguous pages.
10081 static uvm_processor_id_t block_select_residency(uvm_va_block_t *va_block,
10082                                                  uvm_va_block_context_t *va_block_context,
10083                                                  uvm_page_index_t page_index,
10084                                                  uvm_processor_id_t processor_id,
10085                                                  NvU32 access_type_mask,
10086                                                  const uvm_va_policy_t *policy,
10087                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10088                                                  uvm_service_operation_t operation,
10089                                                  bool *read_duplicate)
10090 {
10091     uvm_processor_id_t closest_resident_processor;
10092     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10093     bool may_read_duplicate;
10094     uvm_processor_id_t preferred_location;
10095 
    // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as
    // HMM migration is implemented for VMAs other than anonymous memory.
10098     if (is_uvm_fault_force_sysmem_set() || uvm_hmm_must_use_sysmem(va_block, va_block_context)) {
10099         *read_duplicate = false;
10100         return UVM_ID_CPU;
10101     }
10102 
10103     may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
10104 
10105     // Read/prefetch faults on a VA range with read duplication enabled
10106     // always create a copy of the page on the faulting processor's memory.
10107     // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH,
10108     // which will lead to read duplication if it is enabled.
10109     *read_duplicate = may_read_duplicate &&
10110                       (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ);
10111 
10112     if (*read_duplicate)
10113         return processor_id;
10114 
10115     *read_duplicate = false;
10116 
10117     // If read-duplication is active in the page but we are not
10118     // read-duplicating because the access type is not a read or a prefetch,
10119     // the faulting processor should get a local copy
10120     if (may_read_duplicate)
10121         return processor_id;
10122 
    // If the faulting processor is the preferred location, always migrate
10124     preferred_location = policy->preferred_location;
10125     if (uvm_id_equal(processor_id, preferred_location)) {
10126         if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) {
10127             UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN);
10128             if (uvm_va_space_processor_has_memory(va_space, processor_id))
10129                 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id));
10130         }
10131 
10132         return processor_id;
10133     }
10134 
10135     // If the faulting processor is the CPU, HMM has to migrate the block to
10136     // system memory.
10137     // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements.
10138     if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block))
10139         return processor_id;
10140 
10141     if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10142         UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)],
10143                                            processor_id));
10144         return thrashing_hint->pin.residency;
10145     }
10146 
10147     closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block, page_index, processor_id);
10148 
10149     // If the page is not resident anywhere, select the preferred location as
10150     // long as the preferred location is accessible from the faulting processor.
10151     // Otherwise select the faulting processor.
10152     if (UVM_ID_IS_INVALID(closest_resident_processor)) {
10153         if (UVM_ID_IS_VALID(preferred_location) &&
10154             uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)],
10155                                     processor_id)) {
10156             return preferred_location;
10157         }
10158 
10159         return processor_id;
10160     }
10161 
    // AccessedBy mappings might not have been created for the CPU if the thread
10163     // which made the memory resident did not have the proper references on the
10164     // mm_struct (for example, the GPU fault handling path when
10165     // uvm_va_space_mm_enabled() is false).
10166     //
10167     // Also, in uvm_migrate_*, we implement a two-pass scheme in which
10168     // AccessedBy mappings may be delayed to the second pass. This can produce
10169     // faults even if the faulting processor is in the accessed_by mask.
10170     //
10171     // Here, we keep it on the current residency and we just add the missing
10172     // mapping.
10173     if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
10174         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
10175         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10176         return closest_resident_processor;
10177     }
10178 
10179     // Check if we should map the closest resident processor remotely on atomic
10180     // fault
10181     if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
10182         return closest_resident_processor;
10183 
10184     // If the processor has access to the preferred location, and the page is
10185     // not resident on the accessing processor, move it to the preferred
10186     // location.
10187     if (!uvm_id_equal(closest_resident_processor, processor_id) &&
10188         UVM_ID_IS_VALID(preferred_location) &&
10189         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
10190         return preferred_location;
10191 
10192     // If the page is resident on a processor other than the preferred location,
10193     // or the faulting processor can't access the preferred location, we select
10194     // the faulting processor as the new residency.
10195     return processor_id;
10196 }
10197 
10198 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
10199                                                  uvm_va_block_context_t *va_block_context,
10200                                                  uvm_page_index_t page_index,
10201                                                  uvm_processor_id_t processor_id,
10202                                                  NvU32 access_type_mask,
10203                                                  const uvm_va_policy_t *policy,
10204                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
10205                                                  uvm_service_operation_t operation,
10206                                                  bool *read_duplicate)
10207 {
10208     uvm_processor_id_t id;
10209 
10210     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10211                                                   va_block_context->policy,
10212                                                   uvm_va_block_region_for_page(page_index)));
10213     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10214                                                   va_block_context,
10215                                                   uvm_va_block_region_for_page(page_index)));
10216 
10217     id = block_select_residency(va_block,
10218                                 va_block_context,
10219                                 page_index,
10220                                 processor_id,
10221                                 access_type_mask,
10222                                 policy,
10223                                 thrashing_hint,
10224                                 operation,
10225                                 read_duplicate);
10226 
10227     // If the intended residency doesn't have memory, fall back to the CPU.
10228     if (!block_processor_has_memory(va_block, id)) {
10229         *read_duplicate = false;
10230         return UVM_ID_CPU;
10231     }
10232 
10233     return id;
10234 }
10235 
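// Assertion helper: verify that none of revoke_processors has any page of
// revoke_page_mask within region mapped with revoke_prot, i.e. that servicing
// access counter notifications never needs to revoke permissions. Always
// returns true so it can be used inside UVM_ASSERT().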
10236 static bool check_access_counters_dont_revoke(uvm_va_block_t *block,
10237                                               uvm_va_block_context_t *block_context,
10238                                               uvm_va_block_region_t region,
10239                                               const uvm_processor_mask_t *revoke_processors,
10240                                               const uvm_page_mask_t *revoke_page_mask,
10241                                               uvm_prot_t revoke_prot)
10242 {
10243     uvm_processor_id_t id;
10244     for_each_id_in_mask(id, revoke_processors) {
10245         const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot);
10246 
10247         uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot);
10248 
10249         UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0);
10250     }
10251 
10252     return true;
10253 }
10254 
10255 // Update service_context->prefetch_hint, service_context->per_processor_masks,
10256 // and service_context->region.
10257 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
10258                                            uvm_service_block_context_t *service_context)
10259 {
10260     uvm_processor_id_t new_residency;
10261     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10262 
    // Performance heuristics policy: we only consider prefetching when there
    // are migrations to a single processor.
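    // If pages would migrate to more than one processor in this service
    // operation, prefetching is simply disabled (see the else branch below).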
10265     if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) {
10266         uvm_page_index_t page_index;
10267         uvm_page_mask_t *new_residency_mask;
10268         const uvm_va_policy_t *policy = service_context->block_context.policy;
10269 
10270         new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors);
10271         new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10272 
10273         // Update prefetch tracking structure with the pages that will migrate
10274         // due to faults
10275         uvm_perf_prefetch_get_hint(va_block,
10276                                    &service_context->block_context,
10277                                    new_residency,
10278                                    new_residency_mask,
10279                                    service_context->region,
10280                                    &service_context->prefetch_bitmap_tree,
10281                                    &service_context->prefetch_hint);
10282 
10283         // Obtain the prefetch hint and give a fake fault access type to the
10284         // prefetched pages
10285         if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) {
10286             const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask;
10287 
10288             for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) {
10289                 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index));
10290 
10291                 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH;
10292 
10293                 if (uvm_va_policy_is_read_duplicate(policy, va_space) ||
10294                     (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
10295                      uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) {
10296                     if (service_context->read_duplicate_count++ == 0)
10297                         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10298 
10299                     uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10300                 }
10301             }
10302 
10303             uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask);
10304             service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask);
10305         }
10306     }
10307     else {
10308         service_context->prefetch_hint.residency = UVM_ID_INVALID;
10309     }
10310 }
10311 
10312 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
10313                                     uvm_processor_id_t new_residency,
10314                                     uvm_va_block_t *va_block,
10315                                     uvm_va_block_retry_t *block_retry,
10316                                     uvm_service_block_context_t *service_context)
10317 {
10318     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10319     uvm_processor_mask_t *all_involved_processors =
10320         &service_context->block_context.make_resident.all_involved_processors;
10321     uvm_page_mask_t *new_residency_mask =
10322         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10323     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10324     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10325     uvm_make_resident_cause_t cause;
10326     NV_STATUS status;
10327 
10328     // 1- Migrate pages
10329     switch (service_context->operation) {
10330         case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS:
10331             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
10332             break;
10333         case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS:
10334             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
10335             break;
10336         case UVM_SERVICE_OPERATION_ACCESS_COUNTERS:
10337             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10338             break;
10339         default:
10340             UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation);
10341             // Set cause to silence compiler warning that it may be unused.
10342             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
10343             break;
10344     }
10345 
10346     // Reset masks before all of the make_resident calls
10347     uvm_page_mask_zero(did_migrate_mask);
10348     uvm_processor_mask_zero(all_involved_processors);
10349 
10350     // Handle read duplication first so that the caller_page_mask will be free
10351     // to use below and still valid in uvm_va_block_service_finish().
10352     // TODO: Bug 3660922: need to implement HMM read duplication support.
10353     if (service_context->read_duplicate_count != 0 &&
10354         uvm_page_mask_and(caller_page_mask,
10355                           new_residency_mask,
10356                           &service_context->read_duplicate_mask)) {
10357         status = uvm_va_block_make_resident_read_duplicate(va_block,
10358                                                            block_retry,
10359                                                            &service_context->block_context,
10360                                                            new_residency,
10361                                                            service_context->region,
10362                                                            caller_page_mask,
10363                                                            &service_context->prefetch_hint.prefetch_pages_mask,
10364                                                            cause);
10365         if (status != NV_OK)
10366             return status;
10367     }
10368 
10369     if (service_context->read_duplicate_count == 0 ||
10370         uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) {
10371         if (service_context->read_duplicate_count == 0)
10372             uvm_page_mask_copy(caller_page_mask, new_residency_mask);
10373         status = uvm_va_block_make_resident_copy(va_block,
10374                                                  block_retry,
10375                                                  &service_context->block_context,
10376                                                  new_residency,
10377                                                  service_context->region,
10378                                                  caller_page_mask,
10379                                                  &service_context->prefetch_hint.prefetch_pages_mask,
10380                                                  cause);
10381         if (status != NV_OK)
10382             return status;
10383     }
10384 
10385     if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors))
10386         service_context->cpu_fault.did_migrate = true;
10387 
10388     // 2- Check for ECC errors on all GPUs involved in the migration if CPU is
10389     //    the destination. Migrations in response to CPU faults are special
10390     //    because they're on the only path (apart from tools) where CUDA is not
10391     //    involved and wouldn't have a chance to do its own ECC checking.
10392     if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS &&
10393         UVM_ID_IS_CPU(new_residency) &&
10394         !uvm_processor_mask_empty(all_involved_processors)) {
10395         uvm_gpu_t *gpu;
10396 
10397         // Before checking for ECC errors, make sure all of the GPU work
10398         // is finished. Creating mappings on the CPU would have to wait
10399         // for the tracker anyway so this shouldn't hurt performance.
10400         status = uvm_tracker_wait(&va_block->tracker);
10401         if (status != NV_OK)
10402             return status;
10403 
10404         for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) {
10405             // We cannot call into RM here so use the no RM ECC check.
10406             status = uvm_gpu_check_ecc_error_no_rm(gpu);
10407             if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
10408                 // In case we need to call into RM to be sure whether
10409                 // there is an ECC error or not, signal that to the
10410                 // caller by adding the GPU to the mask.
10411                 //
10412                 // In that case the ECC error might be noticed only after
10413                 // the CPU mappings have been already created below,
10414                 // exposing different CPU threads to the possibly corrupt
10415                 // data, but this thread will fault eventually and that's
10416                 // considered to be an acceptable trade-off between
10417                 // performance and ECC error containment.
10418                 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id);
10419                 status = NV_OK;
10420             }
10421             if (status != NV_OK)
10422                 return status;
10423         }
10424     }
10425 
10426     return NV_OK;
10427 }
10428 
10429 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
10430                                       uvm_va_block_t *va_block,
10431                                       uvm_service_block_context_t *service_context)
10432 {
10433     uvm_processor_id_t new_residency = service_context->block_context.make_resident.dest_id;
10434     uvm_page_mask_t *new_residency_mask =
10435         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
10436     uvm_page_mask_t *did_migrate_mask = &service_context->block_context.make_resident.pages_changed_residency;
10437     uvm_page_mask_t *caller_page_mask = &service_context->block_context.caller_page_mask;
10438     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10439     uvm_prot_t new_prot;
10440     uvm_page_index_t page_index;
10441     NV_STATUS status;
10442 
10443     // Update residency.
10444     if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
10445         uvm_va_block_make_resident_finish(va_block,
10446                                           &service_context->block_context,
10447                                           service_context->region,
10448                                           caller_page_mask);
10449 
10450     uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask);
10451 
10452     // The loops below depend on the enums having the following values in order
10453     // to index into service_context->mappings_by_prot[].
10454     BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1);
10455     BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2);
10456     BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3);
10457     BUILD_BUG_ON(UVM_PROT_MAX != 4);
10458 
10459     // 1- Compute mapping protections for the requesting processor on the new
10460     // residency.
10461     for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot)
10462         service_context->mappings_by_prot[new_prot - 1].count = 0;
10463 
10464     for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
10465         new_prot = compute_new_permission(va_block,
10466                                           &service_context->block_context,
10467                                           page_index,
10468                                           processor_id,
10469                                           new_residency,
10470                                           service_context->access_type[page_index]);
10471 
10472         if (service_context->mappings_by_prot[new_prot - 1].count++ == 0)
10473             uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask);
10474 
10475         uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index);
10476     }
10477 
10478     // 2- Revoke permissions
10479     //
10480     // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations.
10481     //       Thus, we need to revoke only if residency did not change and we
10482     //       are mapping higher than READ ONLY.
10483     for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10484         bool pages_need_revocation;
10485         uvm_processor_mask_t revoke_processors;
10486         uvm_prot_t revoke_prot;
10487         bool this_processor_has_enabled_atomics;
10488 
10489         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10490             continue;
10491 
10492         pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask,
10493                                                   &service_context->did_not_migrate_mask,
10494                                                   &service_context->mappings_by_prot[new_prot - 1].page_mask);
10495         if (!pages_need_revocation)
10496             continue;
10497 
10498         uvm_processor_mask_and(&revoke_processors, &va_block->mapped, &va_space->faultable_processors);
10499 
        // Do not revoke permissions from the processor that took the fault
10501         uvm_processor_mask_clear(&revoke_processors, processor_id);
10502 
10503         this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors,
10504                                                                      processor_id);
10505 
10506         // Atomic operations on processors with system-wide atomics
10507         // disabled or with native atomics access to new_residency
10508         // behave like writes.
10509         if (new_prot == UVM_PROT_READ_WRITE ||
10510             !this_processor_has_enabled_atomics ||
10511             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) {
10512 
10513             // Exclude processors with native atomics on the resident copy
10514             uvm_processor_mask_andnot(&revoke_processors,
10515                                       &revoke_processors,
10516                                       &va_space->has_native_atomics[uvm_id_value(new_residency)]);
10517 
10518             // Exclude processors with disabled system-wide atomics
10519             uvm_processor_mask_and(&revoke_processors,
10520                                    &revoke_processors,
10521                                    &va_space->system_wide_atomics_enabled_processors);
10522         }
10523 
10524         if (UVM_ID_IS_CPU(processor_id)) {
10525             revoke_prot = UVM_PROT_READ_WRITE_ATOMIC;
10526         }
10527         else {
10528             revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE:
10529                                                                     UVM_PROT_READ_WRITE_ATOMIC;
10530         }
10531 
10532         // UVM-Lite processors must always have RWA mappings
10533         if (uvm_processor_mask_andnot(&revoke_processors, &revoke_processors, block_get_uvm_lite_gpus(va_block))) {
            // Access counters should never trigger revocations apart from
            // those for read-duplication, which are performed in the calls to
            // uvm_va_block_make_resident_read_duplicate above.
10537             if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
10538                 UVM_ASSERT(check_access_counters_dont_revoke(va_block,
10539                                                              &service_context->block_context,
10540                                                              service_context->region,
10541                                                              &revoke_processors,
10542                                                              &service_context->revocation_mask,
10543                                                              revoke_prot));
10544             }
10545 
10546             // Downgrade other processors' mappings
10547             status = uvm_va_block_revoke_prot_mask(va_block,
10548                                                    &service_context->block_context,
10549                                                    &revoke_processors,
10550                                                    service_context->region,
10551                                                    &service_context->revocation_mask,
10552                                                    revoke_prot);
10553             if (status != NV_OK)
10554                 return status;
10555         }
10556     }
10557 
10558     // 3- Map requesting processor with the necessary privileges
10559     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10560         const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask;
10561 
10562         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10563             continue;
10564 
10565         // 3.1 - Unmap CPU pages
        // HMM CPU mappings can be upgraded at any time without notification,
        // so there is no need to downgrade first.
10568         if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
10569             UVM_ID_IS_CPU(processor_id) &&
10570             !uvm_va_block_is_hmm(va_block)) {
            // The kernel can downgrade managed CPU mappings at any time without
            // notifying us, which means our PTE state could be stale. We handle
            // this by unmapping the CPU PTE and re-mapping it.
            //
            // A CPU fault is unexpected if:
            // curr_prot == RW || (!is_write && curr_prot == RO)
10577             status = uvm_va_block_unmap(va_block,
10578                                         &service_context->block_context,
10579                                         UVM_ID_CPU,
10580                                         service_context->region,
10581                                         map_prot_mask,
10582                                         NULL);
10583             if (status != NV_OK)
10584                 return status;
10585         }
10586 
10587         // 3.2 - Add new mappings
10588 
10589         // The faulting processor can be mapped remotely due to user policy or
10590         // the thrashing mitigation heuristics. Therefore, we set the cause
10591         // accordingly in each case.
10592 
10593         // Map pages that are thrashing first
10594         if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
10595             uvm_page_mask_t *helper_page_mask = &service_context->block_context.caller_page_mask;
10596             bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
10597                                                         map_prot_mask,
10598                                                         &service_context->thrashing_pin_mask);
10599             if (pages_need_mapping) {
10600                 status = uvm_va_block_map(va_block,
10601                                           &service_context->block_context,
10602                                           processor_id,
10603                                           service_context->region,
10604                                           helper_page_mask,
10605                                           new_prot,
10606                                           UvmEventMapRemoteCauseThrashing,
10607                                           &va_block->tracker);
10608                 if (status != NV_OK)
10609                     return status;
10610 
10611                 // Remove thrashing pages from the map mask
10612                 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask,
10613                                                           map_prot_mask,
10614                                                           &service_context->thrashing_pin_mask);
10615                 if (!pages_need_mapping)
10616                     continue;
10617 
10618                 map_prot_mask = helper_page_mask;
10619             }
10620         }
10621 
10622         status = uvm_va_block_map(va_block,
10623                                   &service_context->block_context,
10624                                   processor_id,
10625                                   service_context->region,
10626                                   map_prot_mask,
10627                                   new_prot,
10628                                   UvmEventMapRemoteCausePolicy,
10629                                   &va_block->tracker);
10630         if (status != NV_OK)
10631             return status;
10632     }
10633 
10634     // 4- If pages did migrate, map SetAccessedBy processors, except for
10635     // UVM-Lite
10636     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
10637         bool pages_need_mapping;
10638 
10639         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
10640             continue;
10641 
10642         pages_need_mapping = uvm_page_mask_and(caller_page_mask,
10643                                                new_residency_mask,
10644                                                &service_context->mappings_by_prot[new_prot - 1].page_mask);
10645         if (!pages_need_mapping)
10646             continue;
10647 
10648         // Map pages that are thrashing
10649         if (service_context->thrashing_pin_count > 0) {
10650             uvm_page_index_t page_index;
10651 
10652             for_each_va_block_page_in_region_mask(page_index,
10653                                                   &service_context->thrashing_pin_mask,
10654                                                   service_context->region) {
10655                 uvm_processor_mask_t *map_thrashing_processors = NULL;
10656                 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index);
10657 
10658                 // Check protection type
10659                 if (!uvm_page_mask_test(caller_page_mask, page_index))
10660                     continue;
10661 
10662                 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
10663 
10664                 status = uvm_va_block_add_mappings_after_migration(va_block,
10665                                                                    &service_context->block_context,
10666                                                                    new_residency,
10667                                                                    processor_id,
10668                                                                    uvm_va_block_region_for_page(page_index),
10669                                                                    caller_page_mask,
10670                                                                    new_prot,
10671                                                                    map_thrashing_processors);
10672                 if (status != NV_OK)
10673                     return status;
10674             }
10675 
10676             pages_need_mapping = uvm_page_mask_andnot(caller_page_mask,
10677                                                       caller_page_mask,
10678                                                       &service_context->thrashing_pin_mask);
10679             if (!pages_need_mapping)
10680                 continue;
10681         }
10682 
        // Map the rest of the pages in a single shot
10684         status = uvm_va_block_add_mappings_after_migration(va_block,
10685                                                            &service_context->block_context,
10686                                                            new_residency,
10687                                                            processor_id,
10688                                                            service_context->region,
10689                                                            caller_page_mask,
10690                                                            new_prot,
10691                                                            NULL);
10692         if (status != NV_OK)
10693             return status;
10694     }
10695 
10696     return NV_OK;
10697 }
10698 
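// Service all resident processors selected in service_context. HMM blocks are
// delegated to uvm_hmm_va_block_service_locked(); managed blocks are serviced
// with uvm_va_block_service_copy() followed by uvm_va_block_service_finish().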
10699 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
10700                                       uvm_va_block_t *va_block,
10701                                       uvm_va_block_retry_t *block_retry,
10702                                       uvm_service_block_context_t *service_context)
10703 {
10704     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10705     uvm_processor_id_t new_residency;
10706     NV_STATUS status = NV_OK;
10707 
10708     uvm_assert_mutex_locked(&va_block->lock);
10709     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10710                                                   service_context->block_context.policy,
10711                                                   service_context->region));
10712     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10713                                                   &service_context->block_context,
10714                                                   service_context->region));
10715 
10716     // GPU fault servicing must be done under the VA space read lock. GPU fault
10717     // servicing is required for RM to make forward progress, and we allow other
10718     // threads to call into RM while holding the VA space lock in read mode. If
10719     // we took the VA space lock in write mode on the GPU fault service path,
10720     // we could deadlock because the thread in RM which holds the VA space lock
10721     // for read wouldn't be able to complete until fault servicing completes.
10722     if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id))
10723         uvm_assert_rwsem_locked(&va_space->lock);
10724     else
10725         uvm_assert_rwsem_locked_read(&va_space->lock);
10726 
10727     uvm_va_block_get_prefetch_hint(va_block, service_context);
10728 
10729     for_each_id_in_mask(new_residency, &service_context->resident_processors) {
10730         if (uvm_va_block_is_hmm(va_block)) {
            status = uvm_hmm_va_block_service_locked(processor_id,
                                                     new_residency,
                                                     va_block,
                                                     block_retry,
                                                     service_context);
10732             if (status != NV_OK)
10733                 break;
10734 
10735             continue;
10736         }
10737 
10738         status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context);
10739         if (status != NV_OK)
10740             break;
10741 
10742         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
10743         if (status != NV_OK)
10744             break;
10745     }
10746 
10747     return status;
10748 }
10749 
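// Check whether processor_id may logically access the page at page_index with
// access_type. For GPU accessors this takes zombie ranges and vma protections
// (when available) into account; for non-migratable ranges, CPU accesses are
// rejected and GPU accesses require the preferred location to be accessible.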
10750 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
10751                                                  uvm_va_block_context_t *va_block_context,
10752                                                  uvm_processor_id_t processor_id,
10753                                                  uvm_page_index_t page_index,
10754                                                  uvm_fault_type_t access_type,
10755                                                  bool allow_migration)
10756 {
10757     uvm_va_range_t *va_range = va_block->va_range;
10758     uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type);
10759 
10760     UVM_ASSERT(uvm_va_block_check_policy_is_valid(va_block,
10761                                                   va_block_context->policy,
10762                                                   uvm_va_block_region_for_page(page_index)));
10763     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
10764                                                   va_block_context,
10765                                                   uvm_va_block_region_for_page(page_index)));
10766 
10767     // CPU permissions are checked later by block_map_cpu_page.
10768     //
10769     // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because
10770     //       it can also be called from change_pte. Make change_pte call this
10771     //       function and only check CPU permissions here.
10772     if (UVM_ID_IS_GPU(processor_id)) {
10773         if (va_range && uvm_va_range_is_managed_zombie(va_range))
10774             return NV_ERR_INVALID_ADDRESS;
10775 
10776         // GPU faults only check vma permissions if a mm is registered with the
        // VA space (i.e. uvm_va_space_mm_retain_lock(va_space) != NULL) or if
10778         // uvm_enable_builtin_tests is set, because the Linux kernel can change
10779         // vm_flags at any moment (for example on mprotect) and here we are not
10780         // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that
10781         // this scenario does not happen.
10782         if ((va_block_context->mm || uvm_enable_builtin_tests) &&
10783             (access_prot > compute_logical_prot(va_block, va_block_context, page_index)))
10784             return NV_ERR_INVALID_ACCESS_TYPE;
10785     }
10786 
10787     // Non-migratable range:
10788     // - CPU accesses are always fatal, regardless of the VA range residency
10789     // - GPU accesses are fatal if the GPU can't map the preferred location
10790     if (!allow_migration) {
10791         UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
10792 
10793         if (UVM_ID_IS_CPU(processor_id)) {
10794             return NV_ERR_INVALID_OPERATION;
10795         }
10796         else {
10797             uvm_va_space_t *va_space = va_range->va_space;
10798 
10799             return uvm_processor_mask_test(
10800                     &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
10801                     processor_id)?
10802                 NV_OK : NV_ERR_INVALID_ACCESS_TYPE;
10803         }
10804     }
10805 
10806     return NV_OK;
10807 }
10808 
// Check if we are faulting on a page that already has valid permissions, in
// which case fault handling can be skipped. See
// uvm_va_block_t::cpu::fault_authorized for more details.
10812 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
10813                                                   uvm_page_index_t page_index,
10814                                                   uvm_fault_access_type_t fault_access_type)
10815 {
10816     // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for
10817     // HMM?
10818     if (uvm_va_block_is_hmm(va_block))
10819         return false;
10820 
10821     if (block_page_is_processor_authorized(va_block,
10822                                            page_index,
10823                                            UVM_ID_CPU,
10824                                            uvm_fault_access_type_to_prot(fault_access_type))) {
10825         NvU64 now = NV_GETTIME();
10826         pid_t pid = current->pid;
10827 
10828         // Latch the pid/timestamp/page_index values for the first time
10829         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
10830             va_block->cpu.fault_authorized.first_fault_stamp = now;
10831             va_block->cpu.fault_authorized.first_pid = pid;
10832             va_block->cpu.fault_authorized.page_index = page_index;
10833 
10834             return true;
10835         }
10836 
10837         // If the same thread shows up again, this means that the kernel
10838         // downgraded the page's PTEs. Service the fault to force a remap of
10839         // the page.
10840         if (va_block->cpu.fault_authorized.first_pid == pid &&
10841             va_block->cpu.fault_authorized.page_index == page_index) {
10842             va_block->cpu.fault_authorized.first_fault_stamp = 0;
10843         }
10844         else {
            // If the window has expired, clear the information and service the
            // fault. Otherwise, just return.
10847             if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns)
10848                 va_block->cpu.fault_authorized.first_fault_stamp = 0;
10849             else
10850                 return true;
10851         }
10852     }
10853 
10854     return false;
10855 }
10856 
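// Service a CPU fault on a single page of va_block with the block lock held.
// This checks logical permissions, applies thrashing hints (throttling or
// pinning), selects the new residency for the faulting page and services it
// through uvm_va_block_service_locked().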
10857 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
10858                                         uvm_va_block_retry_t *va_block_retry,
10859                                         NvU64 fault_addr,
10860                                         uvm_fault_access_type_t fault_access_type,
10861                                         uvm_service_block_context_t *service_context)
10862 {
10863     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10864     NV_STATUS status = NV_OK;
10865     uvm_page_index_t page_index;
10866     uvm_perf_thrashing_hint_t thrashing_hint;
10867     uvm_processor_id_t new_residency;
10868     bool read_duplicate;
10869 
10870     uvm_assert_rwsem_locked(&va_space->lock);
10871 
10872     UVM_ASSERT(fault_addr >= va_block->start);
10873     UVM_ASSERT(fault_addr <= va_block->end);
10874 
10875     uvm_assert_mmap_lock_locked(service_context->block_context.mm);
10876 
10877     service_context->block_context.policy = uvm_va_policy_get(va_block, fault_addr);
10878 
10879     if (service_context->num_retries == 0) {
        // Notify the event to tools/performance heuristics
10881         uvm_perf_event_notify_cpu_fault(&va_space->perf_events,
10882                                         va_block,
10883                                         service_context->block_context.policy->preferred_location,
10884                                         fault_addr,
10885                                         fault_access_type > UVM_FAULT_ACCESS_TYPE_READ,
10886                                         KSTK_EIP(current));
10887     }
10888 
10889     // Check logical permissions
10890     page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
10891     status = uvm_va_block_check_logical_permissions(va_block,
10892                                                     &service_context->block_context,
10893                                                     UVM_ID_CPU,
10894                                                     page_index,
10895                                                     fault_access_type,
10896                                                     uvm_range_group_address_migratable(va_space, fault_addr));
10897     if (status != NV_OK)
10898         return status;
10899 
10900     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
10901 
10902     if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
10903         return NV_OK;
10904 
10905     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, fault_addr, UVM_ID_CPU);
10906     // Throttling is implemented by sleeping in the fault handler on the CPU
10907     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
10908         service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp;
10909         return NV_WARN_MORE_PROCESSING_REQUIRED;
10910     }
10911 
10912     service_context->read_duplicate_count = 0;
10913     service_context->thrashing_pin_count = 0;
10914     service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
10915 
10916     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
10917         uvm_page_mask_zero(&service_context->thrashing_pin_mask);
10918         uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
10919         service_context->thrashing_pin_count = 1;
10920     }
10921 
    // Compute the new residency; the service context masks are updated below
10923     new_residency = uvm_va_block_select_residency(va_block,
10924                                                   &service_context->block_context,
10925                                                   page_index,
10926                                                   UVM_ID_CPU,
10927                                                   uvm_fault_access_type_mask_bit(fault_access_type),
10928                                                   service_context->block_context.policy,
10929                                                   &thrashing_hint,
10930                                                   UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
10931                                                   &read_duplicate);
10932 
10933     // Initialize the minimum necessary state in the fault service context
10934     uvm_processor_mask_zero(&service_context->resident_processors);
10935 
10936     // Set new residency and update the masks
10937     uvm_processor_mask_set(&service_context->resident_processors, new_residency);
10938 
10939     // The masks need to be fully zeroed as the fault region may grow due to prefetching
10940     uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
10941     uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
10942 
10943     if (read_duplicate) {
10944         uvm_page_mask_zero(&service_context->read_duplicate_mask);
10945         uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
10946         service_context->read_duplicate_count = 1;
10947     }
10948 
10949     service_context->access_type[page_index] = fault_access_type;
10950 
10951     service_context->region = uvm_va_block_region_for_page(page_index);
10952 
10953     status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context);
10954 
10955     ++service_context->num_retries;
10956 
10957     return status;
10958 }
10959 
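// CPU fault entry point for a VA block: translate is_write into a fault access
// type and service the fault under UVM_VA_BLOCK_LOCK_RETRY().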
10960 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
10961                                  NvU64 fault_addr,
10962                                  bool is_write,
10963                                  uvm_service_block_context_t *service_context)
10964 {
10965     NV_STATUS status;
10966     uvm_va_block_retry_t va_block_retry;
10967     uvm_fault_access_type_t fault_access_type;
10968 
10969     if (is_write)
10970         fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
10971     else
10972         fault_access_type = UVM_FAULT_ACCESS_TYPE_READ;
10973 
10974     service_context->num_retries = 0;
10975     service_context->cpu_fault.did_migrate = false;
10976 
10977     // We have to use vm_insert_page instead of handing the page to the kernel
10978     // and letting it insert the mapping, and we must do that while holding the
10979     // lock on this VA block. Otherwise there will be a window in which we think
10980     // we've mapped the page but the CPU mapping hasn't actually been created
10981     // yet. During that window a GPU fault event could arrive and claim
10982     // ownership of that VA, "unmapping" it. Then later the kernel would
10983     // eventually establish the mapping, and we'd end up with both CPU and GPU
10984     // thinking they each owned the page.
10985     //
10986     // This function must only be called when it's safe to call vm_insert_page.
10987     // That is, there must be a reference held on the vma's vm_mm, and
10988     // vm_mm->mmap_lock is held in at least read mode. Note that current->mm
10989     // might not be vma->vm_mm.
10990     status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
10991                                      &va_block_retry,
10992                                      block_cpu_fault_locked(va_block,
10993                                                             &va_block_retry,
10994                                                             fault_addr,
10995                                                             fault_access_type,
10996                                                             service_context));
10997     return status;
10998 }
10999 
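// Find the VA block containing addr. Addresses not covered by any VA range are
// looked up as HMM blocks. Non-managed ranges return NV_ERR_INVALID_ADDRESS,
// and managed ranges whose block has not been created yet return
// NV_ERR_OBJECT_NOT_FOUND.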
11000 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block)
11001 {
11002     uvm_va_range_t *va_range;
11003     uvm_va_block_t *block;
11004     size_t index;
11005 
11006     va_range = uvm_va_range_find(va_space, addr);
11007     if (!va_range)
11008         return uvm_hmm_va_block_find(va_space, addr, out_block);
11009 
11010     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
11011                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
11012 
11013     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
11014         return NV_ERR_INVALID_ADDRESS;
11015 
11016     index = uvm_va_range_block_index(va_range, addr);
11017     block = uvm_va_range_block(va_range, index);
11018     if (!block)
11019         return NV_ERR_OBJECT_NOT_FOUND;
11020 
11021     *out_block = block;
11022     return NV_OK;
11023 }
11024 
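// Find or create the VA block containing addr within va_range. A NULL va_range
// means addr can only belong to an HMM block, which requires a valid mm in
// va_block_context. When built-in tests are enabled, an allocation failure can
// be injected via va_space->test.va_block_allocation_fail_nth.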
11025 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
11026                                             uvm_va_range_t *va_range,
11027                                             NvU64 addr,
11028                                             uvm_va_block_context_t *va_block_context,
11029                                             uvm_va_block_t **out_block)
11030 {
11031     size_t index;
11032 
11033     if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0)
11034         return NV_ERR_NO_MEMORY;
11035 
11036     if (!va_range) {
11037         if (!va_block_context || !va_block_context->mm)
11038             return NV_ERR_INVALID_ADDRESS;
11039         return uvm_hmm_va_block_find_create(va_space, addr, va_block_context, out_block);
11040     }
11041 
11042     UVM_ASSERT(addr >= va_range->node.start);
11043     UVM_ASSERT(addr <= va_range->node.end);
11044 
11045     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
11046                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
11047 
11048     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
11049         return NV_ERR_INVALID_ADDRESS;
11050 
11051     index = uvm_va_range_block_index(va_range, addr);
11052     return uvm_va_range_block_create(va_range, index, out_block);
11053 }
11054 
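// Same as uvm_va_block_find_create_in_range(), but looks up the VA range
// containing addr first.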
11055 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
11056                                    NvU64 addr,
11057                                    uvm_va_block_context_t *va_block_context,
11058                                    uvm_va_block_t **out_block)
11059 {
11060     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
11061 
11062     return uvm_va_block_find_create_in_range(va_space, va_range, addr, va_block_context, out_block);
11063 }
11064 
11065 // Launch a synchronous, encrypted copy between GPU and CPU.
11066 //
11067 // The copy entails a GPU-side encryption (relying on the Copy Engine), and a
// CPU-side decryption step, such that the destination CPU buffer pointed to by
11069 // dst_plain will contain the unencrypted (plain text) contents. The destination
11070 // buffer can be in protected or unprotected sysmem, while the source buffer
11071 // must be in protected vidmem.
11072 //
11073 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11074 //
11075 // The input tracker, if not NULL, is internally acquired by the push
11076 // responsible for the encrypted copy.
11077 __attribute__ ((format(printf, 6, 7)))
11078 static NV_STATUS encrypted_memcopy_gpu_to_cpu(uvm_gpu_t *gpu,
11079                                               void *dst_plain,
11080                                               uvm_gpu_address_t src_gpu_address,
11081                                               size_t size,
11082                                               uvm_tracker_t *tracker,
11083                                               const char *format,
11084                                               ...)
11085 {
11086     NV_STATUS status;
11087     UvmCslIv decrypt_iv;
11088     uvm_push_t push;
11089     uvm_conf_computing_dma_buffer_t *dma_buffer;
11090     uvm_gpu_address_t dst_gpu_address, auth_tag_gpu_address;
11091     void *src_cipher, *auth_tag;
11092     va_list args;
11093 
11094     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11095     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11096 
11097     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11098     if (status != NV_OK)
11099         return status;
11100 
11101     va_start(args, format);
11102     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_GPU_TO_CPU, tracker, &push, format, args);
11103     va_end(args);
11104 
11105     if (status != NV_OK)
11106         goto out;
11107 
11108     uvm_conf_computing_log_gpu_encryption(push.channel, &decrypt_iv);
11109 
11110     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11111     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11112     gpu->parent->ce_hal->encrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11113 
11114     status = uvm_push_end_and_wait(&push);
11115     if (status != NV_OK)
11116         goto out;
11117 
11118     src_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11119     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11120     status = uvm_conf_computing_cpu_decrypt(push.channel, dst_plain, src_cipher, &decrypt_iv, size, auth_tag);
11121 
11122  out:
11123     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11124     return status;
11125 }
11126 
11127 // Launch a synchronous, encrypted copy between CPU and GPU.
11128 //
// The source CPU buffer pointed to by src_plain contains the unencrypted (plain
11130 // text) contents; the function internally performs a CPU-side encryption step
11131 // before launching the GPU-side CE decryption. The source buffer can be in
11132 // protected or unprotected sysmem, while the destination buffer must be in
11133 // protected vidmem.
11134 //
11135 // The maximum copy size allowed is UVM_CONF_COMPUTING_DMA_BUFFER_SIZE.
11136 //
11137 // The input tracker, if not NULL, is internally acquired by the push
11138 // responsible for the encrypted copy.
11139 __attribute__ ((format(printf, 6, 7)))
11140 static NV_STATUS encrypted_memcopy_cpu_to_gpu(uvm_gpu_t *gpu,
11141                                               uvm_gpu_address_t dst_gpu_address,
11142                                               void *src_plain,
11143                                               size_t size,
11144                                               uvm_tracker_t *tracker,
11145                                               const char *format,
11146                                               ...)
11147 {
11148     NV_STATUS status;
11149     uvm_push_t push;
11150     uvm_conf_computing_dma_buffer_t *dma_buffer;
11151     uvm_gpu_address_t src_gpu_address, auth_tag_gpu_address;
11152     void *dst_cipher, *auth_tag;
11153     va_list args;
11154 
11155     UVM_ASSERT(uvm_conf_computing_mode_enabled(gpu));
11156     UVM_ASSERT(size <= UVM_CONF_COMPUTING_DMA_BUFFER_SIZE);
11157 
11158     status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool, &dma_buffer, NULL);
11159     if (status != NV_OK)
11160         return status;
11161 
11162     va_start(args, format);
11163     status = uvm_push_begin_acquire(gpu->channel_manager, UVM_CHANNEL_TYPE_CPU_TO_GPU, tracker, &push, format, args);
11164     va_end(args);
11165 
11166     if (status != NV_OK)
11167         goto out;
11168 
11169     dst_cipher = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
11170     auth_tag = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
11171     uvm_conf_computing_cpu_encrypt(push.channel, dst_cipher, src_plain, NULL, size, auth_tag);
11172 
11173     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
11174     auth_tag_gpu_address = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
11175     gpu->parent->ce_hal->decrypt(&push, dst_gpu_address, src_gpu_address, size, auth_tag_gpu_address);
11176 
11177     status = uvm_push_end_and_wait(&push);
11178 
11179 out:
11180     uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool, dma_buffer, NULL);
11181     return status;
11182 }
11183 
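// Write size bytes from src_mem to dst_gpu_address on gpu. With Confidential
// Computing enabled the data is staged through an encrypted copy; otherwise a
// plain CE memcopy is pushed and waited on.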
11184 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
11185                                            uvm_gpu_t *gpu,
11186                                            uvm_gpu_address_t dst_gpu_address,
11187                                            NvU64 dst,
11188                                            uvm_mem_t *src_mem,
11189                                            size_t size)
11190 {
11191     NV_STATUS status;
11192     uvm_push_t push;
11193     uvm_gpu_address_t src_gpu_address;
11194 
11195     if (uvm_conf_computing_mode_enabled(gpu)) {
11196         return encrypted_memcopy_cpu_to_gpu(gpu,
11197                                             dst_gpu_address,
11198                                             uvm_mem_get_cpu_addr_kernel(src_mem),
11199                                             size,
11200                                             &va_block->tracker,
11201                                             "Encrypted write to [0x%llx, 0x%llx)",
11202                                             dst,
11203                                             dst + size);
11204     }
11205 
11206     status = uvm_push_begin_acquire(gpu->channel_manager,
11207                                     UVM_CHANNEL_TYPE_CPU_TO_GPU,
11208                                     &va_block->tracker,
11209                                     &push,
11210                                     "Direct write to [0x%llx, 0x%llx)",
11211                                     dst,
11212                                     dst + size);
11213     if (status != NV_OK)
11214         return status;
11215 
11216     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
11217     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11218     return uvm_push_end_and_wait(&push);
11219 }
11220 
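// Write size bytes from src_mem to virtual address dst within va_block. The
// write goes to the closest resident copy of the destination page (the page is
// made resident on the CPU first if it is not resident anywhere), so the write
// must not span a page boundary.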
11221 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
11222                                       uvm_va_block_context_t *block_context,
11223                                       NvU64 dst,
11224                                       uvm_mem_t *src_mem,
11225                                       size_t size)
11226 {
11227     NV_STATUS status;
11228     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst);
11229     NvU64 page_offset = dst & (PAGE_SIZE - 1);
11230     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11231     uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index);
11232 
11233     uvm_assert_mutex_locked(&va_block->lock);
11234     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size);
11235 
11236     if (UVM_ID_IS_INVALID(proc))
11237         proc = UVM_ID_CPU;
11238 
11239     block_context->policy = uvm_va_policy_get(va_block, dst);
11240 
    // Use make_resident() in all cases to break read-duplication, but
    // block_retry can be NULL because, if the page is not resident yet, we
    // will make it resident on the CPU.
    //
    // Notably we don't care about coherence with respect to atomics from
    // other processors.
11246     status = uvm_va_block_make_resident(va_block,
11247                                         NULL,
11248                                         block_context,
11249                                         proc,
11250                                         region,
11251                                         NULL,
11252                                         NULL,
11253                                         UVM_MAKE_RESIDENT_CAUSE_API_TOOLS);
11254 
11255     if (status != NV_OK)
11256         return status;
11257 
11258     if (UVM_ID_IS_CPU(proc)) {
11259         char *mapped_page;
11260         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11261         void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
11262 
11263         status = uvm_tracker_wait(&va_block->tracker);
11264         if (status != NV_OK)
11265             return status;
11266 
11267         mapped_page = (char *)kmap(page);
11268         memcpy(mapped_page + page_offset, src, size);
11269         kunmap(page);
11270 
11271         return NV_OK;
11272     }
11273     else {
11274         uvm_gpu_t *dst_gpu;
11275         uvm_gpu_address_t dst_gpu_address;
11276 
11277         UVM_ASSERT(UVM_ID_IS_GPU(proc));
11278 
11279         dst_gpu = block_get_gpu(va_block, proc);
11280 
11281         dst_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), dst_gpu);
11282         dst_gpu_address.address += page_offset;
11283 
11284         return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
11285     }
11286 }
11287 
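// Read size bytes from src_gpu_address on gpu into dst_mem. Mirrors
// va_block_write_cpu_to_gpu(): an encrypted copy is used when Confidential
// Computing is enabled, a plain CE memcopy otherwise.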
11288 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
11289                                           uvm_mem_t *dst_mem,
11290                                           uvm_gpu_t *gpu,
11291                                           uvm_gpu_address_t src_gpu_address,
11292                                           NvU64 src,
11293                                           size_t size)
11294 {
11295     NV_STATUS status;
11296     uvm_push_t push;
11297     uvm_gpu_address_t dst_gpu_address;
11298 
11299     if (uvm_conf_computing_mode_enabled(gpu)) {
11300         return encrypted_memcopy_gpu_to_cpu(gpu,
11301                                             uvm_mem_get_cpu_addr_kernel(dst_mem),
11302                                             src_gpu_address,
11303                                             size,
11304                                             &va_block->tracker,
11305                                             "Encrypted read from [0x%llx, 0x%llx)",
11306                                             src,
11307                                             src + size);
11308     }
11309 
11310     status = uvm_push_begin_acquire(gpu->channel_manager,
11311                                     UVM_CHANNEL_TYPE_GPU_TO_CPU,
11312                                     &va_block->tracker,
11313                                     &push,
11314                                     "Direct read from [0x%llx, 0x%llx)",
11315                                     src,
11316                                     src + size);
11317     if (status != NV_OK)
11318         return status;
11319 
11320     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
11321     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
11322     return uvm_push_end_and_wait(&push);
11323 }
11324 
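// Read size bytes from virtual address src within va_block into dst_mem. The
// read is serviced from the closest resident copy of the page; if the page is
// not resident anywhere, the destination is zero-filled. The read must not
// span a page boundary.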
11325 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block, uvm_mem_t *dst_mem, NvU64 src, size_t size)
11326 {
11327     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src);
11328     NvU64 page_offset = src & (PAGE_SIZE - 1);
11329     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, page_index, UVM_ID_CPU);
11330     void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem);
11331 
11332     uvm_assert_mutex_locked(&va_block->lock);
11333     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size);
11334 
11335     if (UVM_ID_IS_INVALID(proc)) {
11336         memset(dst, 0, size);
11337         return NV_OK;
11338     }
11339     else if (UVM_ID_IS_CPU(proc)) {
11340         NV_STATUS status;
11341         char *mapped_page;
11342         struct page *page = uvm_cpu_chunk_get_cpu_page(va_block, page_index);
11343 
11344         status = uvm_tracker_wait(&va_block->tracker);
11345         if (status != NV_OK)
11346             return status;
11347 
11348         mapped_page = (char *)kmap(page);
11349         memcpy(dst, mapped_page + page_offset, size);
11350         kunmap(page);
11351 
11352         return NV_OK;
11353     }
11354     else {
11355         uvm_gpu_address_t src_gpu_address;
11356         uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
11357 
11358         src_gpu_address = block_phys_page_copy_address(va_block, block_phys_page(proc, page_index), gpu);
11359         src_gpu_address.address += page_offset;
11360 
11361         return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
11362     }
11363 }
11364 
// Deferred work item that reestablishes accessed-by mappings after eviction.
// On GPUs with access counters enabled, the evicted GPU will also get remote
// mappings.
11368 static void block_add_eviction_mappings(void *args)
11369 {
11370     uvm_va_block_t *va_block = (uvm_va_block_t*)args;
11371     uvm_va_space_t *va_space;
11372     uvm_processor_id_t id;
11373     uvm_va_block_context_t *block_context = NULL;
11374     struct mm_struct *mm = NULL;
11375 
11376     uvm_mutex_lock(&va_block->lock);
11377     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11378     uvm_mutex_unlock(&va_block->lock);
11379 
11380     if (!va_space) {
11381         // Block has been killed in the meantime
11382         goto done;
11383     }
11384 
11385     mm = uvm_va_space_mm_retain_lock(va_space);
11386 
11387     block_context = uvm_va_block_context_alloc(mm);
11388     if (!block_context)
11389         goto done;
11390 
11391     // The block wasn't dead when we checked above and that's enough to
11392     // guarantee that the VA space is still around, because
11393     // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that
11394     // flush waits for this function call to finish.
11395     uvm_va_space_down_read(va_space);
11396 
11397     // Now that we have the VA space lock held, we can check whether the block
11398     // is still alive since the VA space write lock is needed to kill blocks.
11399     if (uvm_va_block_is_dead(va_block))
11400         goto unlock;
11401 
11402     if (uvm_va_block_is_hmm(va_block)) {
11403         uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context);
11404     }
11405     else {
11406         uvm_va_range_t *va_range = va_block->va_range;
11407         NV_STATUS status = NV_OK;
11408 
11409         block_context->policy = uvm_va_range_get_policy(va_range);
11410         for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) {
11411             status = uvm_va_block_set_accessed_by(va_block, block_context, id);
11412             if (status != NV_OK)
11413                 break;
11414         }
11415 
11416         if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) {
11417             uvm_processor_mask_t map_processors;
11418 
            // Exclude the processors that have already been mapped due to
            // AccessedBy
11421             uvm_processor_mask_andnot(&map_processors,
11422                                       &va_block->evicted_gpus,
11423                                       &uvm_va_range_get_policy(va_range)->accessed_by);
11424 
11425             for_each_gpu_id_in_mask(id, &map_processors) {
11426                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
11427                 uvm_va_block_gpu_state_t *gpu_state;
11428 
11429                 if (!gpu->parent->access_counters_supported)
11430                     continue;
11431 
11432                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
11433                 UVM_ASSERT(gpu_state);
11434 
11435                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
11436                 // remote mappings to read-duplicated pages. Add support for it
11437                 // or create a new function.
11438                 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
11439                                                  uvm_va_block_add_mappings(va_block,
11440                                                                            block_context,
11441                                                                            id,
11442                                                                            uvm_va_block_region_from_block(va_block),
11443                                                                            &gpu_state->evicted,
11444                                                                            UvmEventMapRemoteCauseEviction));
11445                 if (status != NV_OK)
11446                     break;
11447             }
11448         }
11449 
11450         if (status != NV_OK) {
11451             UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n",
11452                           va_block->start,
11453                           va_block->end,
11454                           nvstatusToString(status),
11455                           uvm_va_space_processor_name(va_space, id));
11456         }
11457     }
11458 
11459 unlock:
11460     uvm_va_space_up_read(va_space);
11461     uvm_va_block_context_free(block_context);
11462 
11463 done:
11464     uvm_va_space_mm_release_unlock(va_space, mm);
11465     uvm_va_block_release(va_block);
11466 }
11467 
11468 static void block_add_eviction_mappings_entry(void *args)
11469 {
11470     UVM_ENTRY_VOID(block_add_eviction_mappings(args));
11471 }
11472 
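// Evict the pages of va_block backed by subchunks of root_chunk on gpu by
// migrating them to the CPU. Mappings cannot be reestablished on the eviction
// path, so block_add_eviction_mappings() is scheduled as deferred work when
// accessed-by or map-remote-on-eviction policies apply.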
11473 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
11474                                     uvm_gpu_t *gpu,
11475                                     uvm_gpu_chunk_t *root_chunk,
11476                                     uvm_tracker_t *tracker)
11477 {
11478     NV_STATUS status = NV_OK;
11479     NvU32 i;
11480     uvm_va_block_gpu_state_t *gpu_state;
11481     uvm_va_block_region_t chunk_region;
11482     size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu);
11483     size_t chunks_to_evict = 0;
11484     uvm_va_block_context_t *block_context;
11485     uvm_page_mask_t *pages_to_evict;
11486     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
11487     uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
11488     struct mm_struct *mm;
11489     bool accessed_by_set = false;
11490 
11491     uvm_assert_mutex_locked(&va_block->lock);
11492 
11493     // The block might have been killed in the meantime
11494     if (!va_space)
11495         return NV_OK;
11496 
11497     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
11498     if (!gpu_state)
11499         return NV_OK;
11500 
11501     if (va_block_test && va_block_test->inject_eviction_error) {
11502         va_block_test->inject_eviction_error = false;
11503         return NV_ERR_NO_MEMORY;
11504     }
11505 
    // We cannot take this block's VA space or mmap_lock locks on the eviction
    // path. However, we retain mm in order to support accounting of CPU memory
    // allocations. If mappings need to be created,
    // block_add_eviction_mappings() will be scheduled below.
11510     mm = uvm_va_space_mm_retain(va_space);
11511     block_context = uvm_va_block_context_alloc(mm);
11512     if (!block_context) {
11513         if (mm)
11514             uvm_va_space_mm_release(va_space);
11515         return NV_ERR_NO_MEMORY;
11516     }
11517 
11518     pages_to_evict = &block_context->caller_page_mask;
11519     uvm_page_mask_zero(pages_to_evict);
11520     chunk_region.outer = 0;
11521 
11522     // Find all chunks that are subchunks of the root chunk
11523     for (i = 0; i < num_gpu_chunks; ++i) {
11524         uvm_chunk_size_t chunk_size;
11525         size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size);
11526         UVM_ASSERT(chunk_index == i);
11527         chunk_region.first = chunk_region.outer;
11528         chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE;
11529 
11530         if (!gpu_state->chunks[i])
11531             continue;
11532         if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk))
11533             continue;
11534 
11535         if (uvm_va_block_is_hmm(va_block)) {
11536             status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region);
11537             if (status != NV_OK)
11538                 break;
11539         }
11540 
11541         uvm_page_mask_region_fill(pages_to_evict, chunk_region);
11542         ++chunks_to_evict;
11543     }
11544 
11545     if (chunks_to_evict == 0)
11546         goto out;
11547 
11548     // Only move pages resident on the GPU
11549     uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id));
11550     uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
11551 
11552     if (uvm_va_block_is_hmm(va_block)) {
11553         status = uvm_hmm_va_block_evict_chunks(va_block,
11554                                                block_context,
11555                                                pages_to_evict,
11556                                                uvm_va_block_region_from_block(va_block),
11557                                                &accessed_by_set);
11558     }
11559     else {
11560         block_context->policy = uvm_va_range_get_policy(va_block->va_range);
11561         accessed_by_set = uvm_processor_mask_get_count(&block_context->policy->accessed_by) > 0;
11562 
11563         // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's
11564         // not necessary to do so for eviction. Add a version that unmaps only
11565         // the processors that have mappings to the pages being evicted.
11566         status = uvm_va_block_make_resident(va_block,
11567                                             NULL,
11568                                             block_context,
11569                                             UVM_ID_CPU,
11570                                             uvm_va_block_region_from_block(va_block),
11571                                             pages_to_evict,
11572                                             NULL,
11573                                             UVM_MAKE_RESIDENT_CAUSE_EVICTION);
11574     }
11575     if (status != NV_OK)
11576         goto out;
11577 
    // The VA space lock may not be held here, so we cannot reestablish any
    // mappings; that work is deferred to a work queue.
    //
    // Reading the accessed_by mask without the VA space lock is safe because
    // adding a new processor to the mask triggers going over all the VA blocks
    // in the range and locking them, and we hold this block's lock.
    //
    // If uvm_va_range_set_accessed_by() hasn't called
    // uvm_va_block_set_accessed_by() for this block yet, it will take care of
    // adding the mapping after we are done. If it already did, we are
    // guaranteed to see the new processor in the accessed_by mask because we
    // locked the block's lock that the thread calling
    // uvm_va_range_set_accessed_by() unlocked after updating the mask.
    //
    // If a processor gets removed from the mask, we might not notice and
    // schedule the work item anyway, but that's benign as
    // block_add_eviction_mappings() re-examines the mask.
    //
    // Checking whether access counter migrations are enabled on a VA space is
    // racy without holding the VA space lock. However, this is fine as
    // block_add_eviction_mappings() re-examines the value while holding the
    // VA space lock.
11600     if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) {
11601         // Always retain the VA block first so that it's safe for the deferred
11602         // callback to release it immediately after it runs.
11603         uvm_va_block_retain(va_block);
11604 
11605         if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q,
11606                                           &va_block->eviction_mappings_q_item)) {
11607             // And release it if no new callback was scheduled
11608             uvm_va_block_release_no_destroy(va_block);
11609         }
11610     }
11611 
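    // Add this block's tracker entries to the caller's tracker so that any
    // pending work on this block is waited on before the chunks are reused.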
11612     status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker);
11613     if (status != NV_OK)
11614         goto out;
11615 
11616     for (i = 0; i < num_gpu_chunks; ++i) {
11617         uvm_gpu_id_t accessing_gpu_id;
11618         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
11619 
11620         if (!chunk)
11621             continue;
11622         if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
11623             continue;
11624 
        // Remove the mappings of indirect peers from the reverse map. We
        // access the indirect peer mask from the VA space without holding the
        // VA space lock. Therefore, we can race with enable_peer/disable_peer
        // operations. However, this is fine:
        //
        // The enable_peer sequence is as follows:
        //
        // set_bit in va_space->indirect_peers
        // uvm_va_block_enable_peer;
        //
        // - If we read the mask BEFORE it is set or AFTER the mapping has
        //   been added to the map, there is no race.
        // - If we read the mask AFTER it is set but BEFORE adding the mapping
        //   to the reverse map, we will try to remove it although it is not
        //   there yet. Therefore, we use
        //   uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
        //   not check if the mapping is present in the reverse map.
        //
        // The disable_peer sequence is as follows:
        //
        // uvm_va_block_disable_peer;
        // clear_bit in va_space->indirect_peers
        //
        // - If we read the mask BEFORE the mapping has been added to the map
        //   or AFTER the bit has been cleared, there is no race.
        // - If we read the mask AFTER the mapping has been removed and BEFORE
        //   the bit is cleared, we will try to remove the mapping, too.
        //   Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction
        //   works in this scenario.
        //
        // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
        // peers are not supported when SMC is enabled.
11656         for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
11657             uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
11658             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
11659 
11660             uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
11661                                                                    peer_addr);
11662         }
11663 
11664         uvm_mmu_chunk_unmap(chunk, tracker);
11665 
        uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, chunk);
11667         gpu_state->chunks[i] = NULL;
11668     }
11669 
11670 out:
11671     uvm_va_block_context_free(block_context);
11672     if (mm)
11673         uvm_va_space_mm_release(va_space);
11674 
11675     return status;
11676 }
11677 
11678 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
11679 {
11680     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
11681     uvm_push_t push;
11682     NV_STATUS status;
11683 
11684     // See comment in uvm_va_block_set_cancel
11685     UVM_ASSERT(!gpu->parent->fault_cancel_va_supported);
11686 
11687     if (!gpu_state)
11688         return NV_ERR_NO_MEMORY;
11689 
11690     // Force all pages to be 4K and prevent future upgrades during cancel
11691     gpu_state->force_4k_ptes = true;
11692 
    // If we have no page tables, we're done. For fault cancel we need to make
    // sure that fatal faults are on different 4k PTEs than non-fatal faults,
    // and we need to service all non-fatal faults before issuing the cancel.
    // So either all faults are fatal and we have no PTEs (we're PROT_NONE), or
    // we'll allocate PTEs later when we service the non-fatal faults. Those
    // PTEs will be 4k since force_4k_ptes is set.
11699     if (!block_gpu_has_page_tables(block, gpu))
11700         return NV_OK;
11701 
11702     // Are we 4k already?
11703     if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
11704         return NV_OK;
11705 
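    // Pre-allocate the 4k page tables needed by the split below.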
11706     status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL);
11707     if (status != NV_OK)
11708         return status;
11709 
11710     status = uvm_push_begin_acquire(gpu->channel_manager,
11711                                     UVM_CHANNEL_TYPE_MEMOPS,
11712                                     &block->tracker,
11713                                     &push,
11714                                     "Forcing 4k PTEs on block [0x%llx, 0x%llx)",
11715                                     block->start,
11716                                     block->end + 1);
11717     if (status != NV_OK)
11718         return status;
11719 
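    // Split the existing 2M or big PTEs down to 4k in place.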
11720     if (gpu_state->pte_is_2m)
11721         block_gpu_split_2m(block, block_context, gpu, NULL, &push);
11722     else
11723         block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push);
11724 
11725     uvm_push_end(&push);
11726 
11727     UVM_ASSERT(block_check_mappings(block));
11728 
11729     return uvm_tracker_add_push_safe(&block->tracker, &push);
11730 }
11731 
11732 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
11733 {
11734     uvm_assert_mutex_locked(&va_block->lock);
11735 
    // Volta+ devices support a global VA cancel method that does not require
    // 4k PTEs. Skip the PTE splitting in that case, particularly because it
    // could result in 4k PTEs on P9 systems which would otherwise never need
    // them.
11740     if (gpu->parent->fault_cancel_va_supported)
11741         return NV_OK;
11742 
11743     return block_gpu_force_4k_ptes(va_block, block_context, gpu);
11744 }
11745 
11746 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
11747 {
11748     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11749     struct mm_struct *mm;
11750     uvm_va_block_t *va_block;
11751     uvm_va_block_test_t *va_block_test;
11752     uvm_va_block_context_t *block_context = NULL;
11753     NV_STATUS status = NV_OK;
11754 
11755     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11756     uvm_va_space_down_read(va_space);
11757 
11758     block_context = uvm_va_block_context_alloc(mm);
11759     if (!block_context) {
11760         status = NV_ERR_NO_MEMORY;
11761         goto out;
11762     }
11763 
11764     status = uvm_va_block_find_create(va_space, params->lookup_address, block_context, &va_block);
11765     if (status != NV_OK)
11766         goto out;
11767 
11768     va_block_test = uvm_va_block_get_test(va_block);
11769     UVM_ASSERT(va_block_test);
11770 
11771     uvm_mutex_lock(&va_block->lock);
11772 
11773     if (params->page_table_allocation_retry_force_count)
11774         va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;
11775 
11776     if (params->user_pages_allocation_retry_force_count)
11777         va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;
11778 
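    // The requested CPU chunk sizes must be a subset of the supported sizes
    // and must always include PAGE_SIZE.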
11779     if (params->cpu_chunk_allocation_size_mask) {
11780         if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
11781             !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
11782             status = NV_ERR_INVALID_ARGUMENT;
11783             goto block_unlock;
11784         }
11785 
11786         va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
11787     }
11788 
11789     if (params->eviction_error)
11790         va_block_test->inject_eviction_error = params->eviction_error;
11791 
11792     if (params->cpu_pages_allocation_error_count)
11793         va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
11794 
11795     if (params->populate_error)
11796         va_block_test->inject_populate_error = params->populate_error;
11797 
11798 block_unlock:
11799     uvm_mutex_unlock(&va_block->lock);
11800 
11801 out:
11802     uvm_va_space_up_read(va_space);
11803     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11804     uvm_va_block_context_free(block_context);
11805     return status;
11806 }
11807 
11808 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
11809 {
11810     [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
11811     [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
11812     [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
11813     [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
11814 };
11815 
11816 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
11817 {
11818     [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
11819     [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
11820     [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
11821     [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
11822 };
11823 
11824 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
11825 {
11826     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11827     uvm_va_block_t *block;
11828     struct mm_struct *mm;
11829     NV_STATUS status = NV_OK;
11830     uvm_prot_t curr_prot, new_prot;
11831     uvm_gpu_t *gpu = NULL;
11832     uvm_processor_id_t id;
11833     uvm_tracker_t local_tracker;
11834     uvm_va_block_region_t region;
11835     uvm_va_block_context_t *block_context = NULL;
11836 
11837     if (!PAGE_ALIGNED(params->va))
11838         return NV_ERR_INVALID_ADDRESS;
11839 
11840     if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
11841         return NV_ERR_INVALID_ARGUMENT;
11842 
11843     new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
11844 
11845     // mmap_lock isn't needed for invalidating CPU mappings, but it will be
11846     // needed for inserting them.
11847     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11848     uvm_va_space_down_read(va_space);
11849 
11850     if (uvm_uuid_is_cpu(&params->uuid)) {
11851         id = UVM_ID_CPU;
11852     }
11853     else {
11854         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
11855         if (!gpu) {
11856             status = NV_ERR_INVALID_DEVICE;
11857             goto out;
11858         }
11859 
11860         // Check if the GPU can access the VA
11861         if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
11862             status = NV_ERR_OUT_OF_RANGE;
11863             goto out;
11864         }
11865 
11866         id = gpu->id;
11867     }
11868 
11869     block_context = uvm_va_block_context_alloc(mm);
11870     if (!block_context) {
11871         status = NV_ERR_NO_MEMORY;
11872         goto out;
11873     }
11874 
11875     status = uvm_va_block_find_create(va_space, params->va, block_context, &block);
11876     if (status != NV_OK)
11877         goto out;
11878 
11879     // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
11880     if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
11881         goto out;
11882 
11883     uvm_mutex_lock(&block->lock);
11884 
11885     region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
11886     curr_prot = block_page_prot(block, id, region.first);
11887 
11888     if (new_prot == curr_prot) {
11889         status = NV_OK;
11890         goto out_block;
11891     }
11892 
11893     // TODO: Bug 1766124: Upgrades might require revoking other processors'
11894     //       access privileges. We just fail for now. Only downgrades are
11895     //       supported. If we allowed upgrades, we would need to check the mm
11896     //       like we do for revocation below.
11897     if (new_prot > curr_prot) {
11898         status = NV_ERR_INVALID_OPERATION;
11899         goto out_block;
11900     }
11901 
11902     block_context->policy = uvm_va_policy_get(block, params->va);
11903 
11904     if (new_prot == UVM_PROT_NONE) {
11905         status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
11906     }
11907     else {
11908         UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
11909 
11910         // Revoking CPU mappings performs a combination of unmap + map. The map
11911         // portion requires a valid mm.
11912         if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
11913             status = NV_ERR_INVALID_STATE;
11914         }
11915         else {
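            // Revoking new_prot + 1 (and above) downgrades the mapping so
            // that at most new_prot remains.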
11916             status = uvm_va_block_revoke_prot(block,
11917                                               block_context,
11918                                               id,
11919                                               region,
11920                                               NULL,
11921                                               new_prot + 1,
11922                                               &block->tracker);
11923         }
11924     }
11925 
11926 out_block:
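    // Snapshot the block's tracker while holding the block lock, then wait on
    // the snapshot after dropping the lock.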
11927     if (status == NV_OK)
11928         status = uvm_tracker_init_from(&local_tracker, &block->tracker);
11929 
11930     uvm_mutex_unlock(&block->lock);
11931 
11932     if (status == NV_OK)
11933         status = uvm_tracker_wait_deinit(&local_tracker);
11934 
11935 out:
11936     uvm_va_space_up_read(va_space);
11937     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11938 
11939     uvm_va_block_context_free(block_context);
11940 
11941     return status;
11942 }
11943 
11944 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
11945 {
11946     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11947     uvm_va_block_t *va_block;
11948     uvm_va_range_t *va_range;
11949     struct mm_struct *mm;
11950     size_t index;
11951     NV_STATUS status = NV_OK;
11952 
11953     BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);
11954 
11955     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
11956     uvm_va_space_down_read(va_space);
11957 
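    // Managed allocations are tracked by VA ranges. If no range covers the
    // address, fall back to looking up an HMM block.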
11958     va_range = uvm_va_range_find(va_space, params->lookup_address);
11959     if (!va_range) {
11960         status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
11961         if (status == NV_ERR_OBJECT_NOT_FOUND) {
11962             status = uvm_hmm_va_block_range_bounds(va_space,
11963                                                    mm,
11964                                                    params->lookup_address,
11965                                                    &params->va_block_start,
11966                                                    &params->va_block_end,
11967                                                    NULL);
11968             goto out;
11969         }
11970         else if (status != NV_OK) {
11971             goto out;
11972         }
11973     }
11974     else {
11975         index = uvm_va_range_block_index(va_range, params->lookup_address);
11976         va_block = uvm_va_range_block(va_range, index);
11977         if (!va_block) {
11978             status = NV_ERR_OBJECT_NOT_FOUND;
11979             goto out;
11980         }
11981     }
11982 
11983     params->va_block_start = va_block->start;
11984     params->va_block_end   = va_block->end;
11985 
11986 out:
11987     uvm_va_space_up_read(va_space);
11988     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
11989     return status;
11990 }
11991 
11992 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
11993 {
11994     NV_STATUS status = NV_OK;
11995     uvm_va_space_t *va_space = uvm_va_space_get(filp);
11996     uvm_va_range_t *va_range;
11997     uvm_va_block_t *block = NULL;
11998     struct mm_struct *mm;
11999     NvU32 count = 0;
12000     uvm_processor_mask_t resident_on_mask;
12001     uvm_processor_id_t id;
12002     uvm_page_index_t page_index;
12003     unsigned release_block_count = 0;
12004     NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
12005     size_t index;
12006 
12007     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
12008     uvm_va_space_down_read(va_space);
12009 
12010     // Inline uvm_va_block_find() to get the va_range.
12011     va_range = uvm_va_range_find(va_space, addr);
12012     if (!va_range) {
12013         NvU64 start, end;
12014 
12015         status = uvm_hmm_va_block_find(va_space, addr, &block);
12016         if (status != NV_OK) {
12017             if (status != NV_ERR_OBJECT_NOT_FOUND)
12018                 goto out;
12019             status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
12020             goto out;
12021         }
12022         // Update current CPU mapping information.
12023         status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
12024         if (status != NV_OK) {
12025             block = NULL;
12026             goto out;
12027         }
12028     }
12029     else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
12030         status = NV_ERR_INVALID_ADDRESS;
12031         goto out;
12032     }
12033     else {
12034         index = uvm_va_range_block_index(va_range, addr);
12035         block = uvm_va_range_block(va_range, index);
12036         if (!block) {
12037             params->resident_on_count = 0;
12038             params->populated_on_count = 0;
12039             params->mapped_on_count = 0;
12040 
12041             status = NV_OK;
12042 
12043             goto out;
12044         }
12045     }
12046 
12047     uvm_mutex_lock(&block->lock);
12048 
12049     page_index = uvm_va_block_cpu_page_index(block, addr);
12050     uvm_va_block_page_resident_processors(block, page_index, &resident_on_mask);
12051 
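    // Report the physical address and size backing this page on each
    // processor where it is resident.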
12052     for_each_id_in_mask(id, &resident_on_mask) {
12053         block_phys_page_t block_page = block_phys_page(id, page_index);
12054         uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
12055         params->resident_physical_size[count] = block_phys_page_size(block, block_page);
12056         if (UVM_ID_IS_CPU(id)) {
12057             params->resident_physical_address[count] = page_to_phys(uvm_cpu_chunk_get_cpu_page(block, page_index));
12058         }
12059         else {
12060             params->resident_physical_address[count] =
12061                 block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
12062         }
12063         ++count;
12064     }
12065     params->resident_on_count = count;
12066 
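    // Report the mapping type, page size, and mapped physical address for
    // each processor that maps this page.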
12067     count = 0;
12068     for_each_id_in_mask(id, &block->mapped) {
12069         uvm_processor_id_t processor_to_map;
12070         block_phys_page_t block_page;
12071         NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12072 
12073         if (page_size == 0)
12074             continue;
12075 
12076         uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);
12077 
12078         params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
12079         UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
12080         processor_to_map = block_get_processor_to_map(block, id, page_index);
12081         block_page = block_phys_page(processor_to_map, page_index);
12082 
12083         if (!UVM_ID_IS_CPU(id)) {
12084             uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
12085                                                                            block_page,
12086                                                                            uvm_va_space_get_gpu(va_space, id));
12087             params->mapping_physical_address[count] = gpu_phys_addr.address;
12088         }
12089         else {
12090             struct page *page = block_page_get(block, block_page);
12091 
12092             params->mapping_physical_address[count] = page_to_phys(page);
12093         }
12094 
12095         params->page_size[count] = page_size;
12096         ++count;
12097     }
12098 
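    // When the page is resident on exactly one processor, cross-check the
    // reverse mappings. Each successful lookup retains the block, and the
    // references are dropped at 'out' via release_block_count.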
12099     if (params->resident_on_count == 1) {
12100         if (uvm_processor_mask_test(&resident_on_mask, UVM_ID_CPU)) {
12101             if (uvm_pmm_sysmem_mappings_indirect_supported()) {
12102                 for_each_gpu_id(id) {
12103                     NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
12104                     uvm_reverse_map_t sysmem_page;
12105                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page_index);
12106                     size_t num_pages;
12107                     uvm_gpu_t *gpu;
12108 
12109                     if (!uvm_va_block_gpu_state_get(block, id))
12110                         continue;
12111 
12112                     gpu = uvm_va_space_get_gpu(va_space, id);
12113 
12114                     if (!gpu->parent->access_counters_supported)
12115                         continue;
12116 
12117                     num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
12118                                                                     uvm_cpu_chunk_get_gpu_phys_addr(chunk, gpu->parent),
12119                                                                     uvm_cpu_chunk_get_size(chunk),
12120                                                                     &sysmem_page,
12121                                                                     1);
12122                     if (page_size > 0)
12123                         UVM_ASSERT(num_pages == 1);
12124                     else
12125                         UVM_ASSERT(num_pages <= 1);
12126 
12127                     if (num_pages == 1) {
12128                         UVM_ASSERT(sysmem_page.va_block == block);
12129                         UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
12130                         UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
12131 
12132                         ++release_block_count;
12133                     }
12134                 }
12135             }
12136         }
12137         else {
12138             uvm_gpu_id_t id = uvm_processor_mask_find_first_id(&resident_on_mask);
12139             uvm_reverse_map_t gpu_mapping;
12140             size_t num_pages;
12141             uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12142             uvm_gpu_phys_address_t phys_addr;
12143 
12144             phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
12145             num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
12146 
            // The chunk may be in the TEMP_PINNED state, so it may not have a
            // VA block assigned. In that case, we don't get a valid
            // translation.
12149             if (num_pages > 0) {
12150                 UVM_ASSERT(num_pages == 1);
12151                 UVM_ASSERT(gpu_mapping.va_block == block);
12152                 UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
12153 
12154                 ++release_block_count;
12155             }
12156         }
12157     }
12158 
12159     params->mapped_on_count = count;
12160 
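    // Finally, report every processor that has this page populated,
    // regardless of residency or mappings.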
12161     count = 0;
12162     for_each_processor_id(id) {
12163         if (!block_processor_page_is_populated(block, id, page_index))
12164             continue;
12165 
12166         uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
12167         ++count;
12168     }
12169     params->populated_on_count = count;
12170 
12171 out:
12172     if (block) {
12173         if (!params->is_async && status == NV_OK)
12174             status = uvm_tracker_wait(&block->tracker);
12175         uvm_mutex_unlock(&block->lock);
12176         while (release_block_count--)
12177             uvm_va_block_release(block);
12178     }
12179     uvm_va_space_up_read(va_space);
12180     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
12181     return status;
12182 }
12183 
12184 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
12185 {
12186     block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
12187 }
12188