/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#include "uvm_linux.h"
#include "uvm_common.h"
#include "uvm_api.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_va_block.h"
#include "uvm_hal_types.h"
#include "uvm_kvmalloc.h"
#include "uvm_tools.h"
#include "uvm_processors.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_perf_thrashing.h"
#include "uvm_perf_prefetch.h"
#include "uvm_mem.h"
#include "uvm_gpu_access_counters.h"
#include "uvm_va_space_mm.h"
#include "uvm_test_ioctl.h"
#include "uvm_conf_computing.h"

typedef enum
{
    BLOCK_PTE_OP_MAP,
    BLOCK_PTE_OP_REVOKE,
    BLOCK_PTE_OP_COUNT
} block_pte_op_t;

static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;

static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
static struct kmem_cache *g_uvm_va_block_cpu_node_state_cache __read_mostly;

static int uvm_fault_force_sysmem __read_mostly = 0;
module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");

static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);

static int uvm_block_cpu_to_cpu_copy_with_ce __read_mostly = 0;
module_param(uvm_block_cpu_to_cpu_copy_with_ce, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(uvm_block_cpu_to_cpu_copy_with_ce, "Use GPU CEs for CPU-to-CPU migrations.");

// Caching is always disabled for mappings to remote memory. The following two
// module parameters can be used to force caching for GPU peer/sysmem mappings.
//
// However, it is important to note that it may not be safe to enable caching
// in the general case so the enablement should only be used for experiments.
static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
                 "Force caching for mappings to peer memory. "
                 "This is an experimental parameter that may cause correctness issues if used.");

static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
                 "Force caching for mappings to system memory. "
                 "This is an experimental parameter that may cause correctness issues if used.");

static void block_add_eviction_mappings_entry(void *args);

uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block)
{
#if UVM_IS_CONFIG_HMM()
    if (va_block->hmm.va_space)
        return va_block->hmm.va_space;
#endif

    if (va_block->va_range)
        return va_block->va_range->va_space;

    return NULL;
}

uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
{
    uvm_va_space_t *va_space;

    UVM_ASSERT(!uvm_va_block_is_dead(va_block));

    va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
    UVM_ASSERT(va_space);

    return va_space;
}

static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

    UVM_ASSERT(UVM_ID_IS_VALID(resident_id));

    // Local vidmem is always cached
    if (uvm_id_equal(resident_id, gpu->id))
        return UVM_MMU_PTE_FLAGS_CACHED;

    if (UVM_ID_IS_CPU(resident_id))
        return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));

    return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
}

static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

    return uvm_va_space_get_gpu(va_space, gpu_id);
}

static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

    return uvm_va_space_processor_name(va_space, id);
}

static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id)
{
    uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);

    return uvm_va_space_processor_has_memory(va_space, id);
}

static bool is_uvm_fault_force_sysmem_set(void)
{
    // Only enforce this during testing
    return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0;
}

bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space)
{
    return uvm_perf_map_remote_on_eviction &&
           uvm_va_space_has_access_counter_migrations(va_space);
}

static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block)
{
    // Note that for HMM we always return a pointer to a zero bitmap
    // (not allocated on the stack) since uvm_lite GPUs are not supported.
    if (uvm_va_block_is_hmm(va_block))
        return &g_uvm_processor_mask_empty;
    else
        return &va_block->va_range->uvm_lite_gpus;
}

void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
{
    if (!retry)
        return;

    uvm_tracker_init(&retry->tracker);
    INIT_LIST_HEAD(&retry->used_chunks);
    INIT_LIST_HEAD(&retry->free_chunks);
}

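// Map a NUMA node id to its compact index among possible nodes (the count of
// possible nodes with a smaller id). Per-node arrays in this file (for example
// the per-node state looked up by block_node_state_get() and the tracking
// masks allocated with num_possible_nodes() entries) are indexed with this
// value.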
static size_t node_to_index(int nid)
{
    UVM_ASSERT(nid != NUMA_NO_NODE);
    UVM_ASSERT(nid < MAX_NUMNODES);
    return __nodes_weight(&node_possible_map, nid);
}

static uvm_va_block_cpu_node_state_t *block_node_state_get(uvm_va_block_t *block, int nid)
{
    size_t index = node_to_index(nid);
    UVM_ASSERT(block->cpu.node_state[index]);
    return block->cpu.node_state[index];
}

static uvm_page_mask_t *block_tracking_node_mask_get(uvm_va_block_context_t *va_block_context, int nid)
{
    size_t index = node_to_index(nid);
    UVM_ASSERT(va_block_context->make_resident.cpu_pages_used.node_masks[index]);
    return va_block_context->make_resident.cpu_pages_used.node_masks[index];
}

// The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks
// are stored.
//
// CPU chunk storage is handled in different ways depending on the type of
// chunks the VA block owns. This is done to minimize the memory required to
// hold metadata.
typedef enum
{
    // The uvm_va_block_t::chunk pointer points to a single 2MB
    // CPU chunk.
    UVM_CPU_CHUNK_STORAGE_CHUNK = 0,

    // The uvm_va_block_t::chunks pointer points to a
    // structure of mixed (64K and 4K) chunks.
    UVM_CPU_CHUNK_STORAGE_MIXED,
    UVM_CPU_CHUNK_STORAGE_COUNT,
} uvm_cpu_chunk_storage_type_t;

#define UVM_CPU_CHUNK_STORAGE_MASK 0x1

// The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is
// MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will
// be stored in the first/last entry, respectively.
#define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK

#define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE)

// This structure is used when a VA block contains 64K or a mix of 64K and 4K
// CPU chunks.
// For each 64K CPU chunk, big_chunks will have its corresponding bit set
// and the corresponding index in slots will point directly to the
// uvm_cpu_chunk_t structure.
//
// For 4K CPU chunks, the corresponding bit in big_chunks will be clear and
// the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers.
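//
// For example (a sketch assuming a 4K PAGE_SIZE and a 64K minimum big page
// size), a fully 64K-aligned 2MB block has 32 slots: a slot holding a 64K
// chunk has its big_chunks bit set and slots[i] pointing at the chunk itself,
// while a slot holding 4K chunks has the bit clear and slots[i] pointing at an
// array of 16 uvm_cpu_chunk_t pointers, one per 4K page in that 64K span.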
typedef struct {
    DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
    void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK];
} uvm_cpu_chunk_storage_mixed_t;

static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
                                                        uvm_cpu_chunk_t *chunk,
                                                        uvm_page_index_t page_index)
{
    UVM_ASSERT(chunk);
    return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
}

static void *uvm_cpu_storage_get_ptr(uvm_va_block_cpu_node_state_t *node_state)
{
    return (void *)(node_state->chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
}

static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_cpu_node_state_t *node_state)
{
    return node_state->chunks & UVM_CPU_CHUNK_STORAGE_MASK;
}

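// Return the NUMA node id on which the given page is resident, or NUMA_NO_NODE
// if the page is not resident on any CPU NUMA node.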
static int block_get_page_node_residency(uvm_va_block_t *block, uvm_page_index_t page_index)
{
    int nid;

    for_each_possible_uvm_node(nid) {
        if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index))
            return nid;
    }

    return NUMA_NO_NODE;
}

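// Number of pages from the start of the block to the first address aligned to
// size. A block that starts on a size-aligned address has a prefix of 0.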
static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
{
    return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
}

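// Index of the slot in uvm_cpu_chunk_storage_mixed_t covering page_index.
// Pages in the misaligned 64K prefix map to slot 0; the remaining pages map to
// one slot per MAX_SMALL_CHUNKS_PER_BIG_SLOT pages, offset by one if a prefix
// exists.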
static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
    uvm_page_index_t prefix;
    size_t slot_index;

    UVM_ASSERT(page_index < block_region.outer);
    prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return 0;

    slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
    UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);

    return slot_index;
}

static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);

    if (page_index < prefix)
        return page_index;

    return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
}

NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
    uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
    uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
    int nid = uvm_cpu_chunk_get_numa_node(chunk);
    uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
    size_t slot_index;
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t **chunks = NULL;

    // We only want to use the bottom bit of a pointer.
    BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));

    if (chunk_size == UVM_CHUNK_SIZE_2M) {
        UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
        UVM_ASSERT(!node_state->chunks);
        node_state->chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
    }
    else {
        if (!node_state->chunks) {
            mixed = uvm_kvmalloc_zero(sizeof(*mixed));
            if (!mixed)
                return NV_ERR_NO_MEMORY;

            node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
        }

        UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
        mixed = uvm_cpu_storage_get_ptr(node_state);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
        UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));

        if (chunk_size == UVM_CHUNK_SIZE_64K) {
            mixed->slots[slot_index] = chunk;
            set_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];

            if (!chunks) {
                chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
                if (!chunks)
                    return NV_ERR_NO_MEMORY;
                mixed->slots[slot_index] = chunks;
            }

            small_index = compute_small_index(va_block, page_index);
            chunks[small_index] = chunk;
        }
    }

    uvm_page_mask_region_fill(&node_state->allocated, chunk_region);
    uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
    return NV_OK;
}

uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    uvm_va_block_cpu_node_state_t *node_state;
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk;
    uvm_cpu_chunk_t **chunks;
    size_t slot_index;

    UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
    UVM_ASSERT(nid != NUMA_NO_NODE);
    node_state = block_node_state_get(va_block, nid);
    if (!uvm_page_mask_test(&node_state->allocated, page_index))
        return NULL;

    UVM_ASSERT(node_state->chunks);

    if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        return uvm_cpu_storage_get_ptr(node_state);
    }
    else {
        mixed = uvm_cpu_storage_get_ptr(node_state);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);
        if (test_bit(slot_index, mixed->big_chunks))
            return mixed->slots[slot_index];

        chunks = mixed->slots[slot_index];
        chunk = chunks[compute_small_index(va_block, page_index)];
    }

    UVM_ASSERT(chunk);
    return chunk;
}

uvm_cpu_chunk_t *uvm_cpu_chunk_get_any_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    int nid;
    uvm_va_block_cpu_node_state_t *node_state;

    // Callers for managed blocks should already know the correct nid and
    // shouldn't need to call this function.
    UVM_ASSERT(uvm_va_block_is_hmm(va_block));

    for_each_possible_uvm_node(nid) {
        node_state = block_node_state_get(va_block, nid);
        if (uvm_page_mask_test(&node_state->allocated, page_index))
            return uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
    }

    return NULL;
}

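// Return the CPU chunk backing page_index on the NUMA node where the page is
// resident, or NULL if the page is not resident on the CPU.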
static uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page_resident(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = NULL;
    int nid = block_get_page_node_residency(va_block, page_index);

    if (nid != NUMA_NO_NODE)
        chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);

    return chunk;
}

void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
    uvm_cpu_chunk_storage_mixed_t *mixed;
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
    uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
    size_t slot_index;
    uvm_cpu_chunk_t **chunks;
    int nid_iter;

    // We want to protect against two threads manipulating the VA block's CPU
    // chunks at the same time. However, when a block is split, the new block's
    // lock is locked without tracking. So, we can't use
    // uvm_assert_mutex_locked().
    UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
    UVM_ASSERT(node_state->chunks);
    UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));

    if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
        UVM_ASSERT(uvm_cpu_storage_get_ptr(node_state) == chunk);
        node_state->chunks = 0;
    }
    else {
        UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
        mixed = uvm_cpu_storage_get_ptr(node_state);
        slot_index = compute_slot_index(va_block, page_index);
        UVM_ASSERT(mixed->slots[slot_index] != NULL);

        if (test_bit(slot_index, mixed->big_chunks)) {
            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
            UVM_ASSERT(mixed->slots[slot_index] == chunk);
            mixed->slots[slot_index] = NULL;
            clear_bit(slot_index, mixed->big_chunks);
        }
        else {
            size_t small_index;

            UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
            chunks = mixed->slots[slot_index];
            small_index = compute_small_index(va_block, page_index);
            UVM_ASSERT(chunks[small_index] == chunk);
            chunks[small_index] = NULL;

            for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
                if (chunks[small_index])
                    break;
            }

            if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
                uvm_kvfree(chunks);
                mixed->slots[slot_index] = NULL;
            }
        }
    }

    uvm_page_mask_region_clear(&node_state->allocated, chunk_region);
    uvm_page_mask_zero(&va_block->cpu.allocated);
    for_each_possible_uvm_node(nid_iter) {
        uvm_va_block_cpu_node_state_t *iter_node_state = block_node_state_get(va_block, nid_iter);
        uvm_page_mask_or(&va_block->cpu.allocated, &va_block->cpu.allocated, &iter_node_state->allocated);
    }

    if (uvm_page_mask_empty(&node_state->allocated) && node_state->chunks) {
        uvm_kvfree(uvm_cpu_storage_get_ptr(node_state));
        node_state->chunks = 0;
    }
}

struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
{
    uvm_va_block_region_t chunk_region;

    UVM_ASSERT(chunk);
    UVM_ASSERT(chunk->page);
    chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
    return chunk->page + (page_index - chunk_region.first);
}

struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(va_block, page_index);

    return uvm_cpu_chunk_get_cpu_page(va_block, chunk, page_index);
}

static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
                                                      uvm_va_block_region_t region,
                                                      int nid,
                                                      uvm_page_index_t *first_chunk_page)
{
    uvm_cpu_chunk_t *chunk = NULL;
    uvm_page_index_t page_index;
    uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);

    if (!node_state)
        return NULL;

    page_index = uvm_va_block_first_page_in_mask(region, &node_state->allocated);
    if (page_index < region.outer)
        chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);

    if (first_chunk_page && chunk) {
        uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
        *first_chunk_page = chunk_region.first;
    }

    return chunk;
}

static uvm_cpu_chunk_t *uvm_cpu_chunk_next_in_region(uvm_va_block_t *va_block,
                                                     uvm_va_block_region_t region,
                                                     int nid,
                                                     uvm_page_index_t prev_page_index,
                                                     uvm_page_index_t *next_chunk_page)
{
    if (prev_page_index >= region.outer)
        return NULL;

    return uvm_cpu_chunk_first_in_region(va_block,
                                         uvm_va_block_region(prev_page_index, region.outer),
                                         nid, next_chunk_page);
}

#define for_each_cpu_chunk_in_block_region(chunk, chunk_start, va_block, nid, region)                                  \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start));                         \
         (chunk) != NULL;                                                                                              \
         (chunk) = uvm_cpu_chunk_next_in_region((va_block),                                                            \
                                                (region),                                                              \
                                                (nid),                                                                 \
                                                (chunk_start) + uvm_cpu_chunk_num_pages((chunk)),                      \
                                                &(chunk_start)))

#define for_each_cpu_chunk_in_block_region_safe(chunk, chunk_start, next_chunk_start, va_block, nid, region)           \
    for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start)),                         \
             (next_chunk_start) = (chunk_start) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0);                        \
         (chunk) != NULL;                                                                                              \
         (chunk) = uvm_cpu_chunk_next_in_region((va_block), (region), (nid), (next_chunk_start), &(chunk_start)),      \
             (next_chunk_start) = (chunk_start) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))

#define for_each_cpu_chunk_in_block(chunk, chunk_start, va_block, nid)                                                 \
    for_each_cpu_chunk_in_block_region((chunk),                                                                        \
                                       (chunk_start),                                                                  \
                                       (va_block),                                                                     \
                                       (nid),                                                                          \
                                       uvm_va_block_region_from_block((va_block)))

#define for_each_cpu_chunk_in_block_safe(chunk, chunk_start, next_chunk_start, va_block, nid)                          \
    for_each_cpu_chunk_in_block_region_safe((chunk),                                                                   \
                                            (chunk_start),                                                             \
                                            (next_chunk_start),                                                        \
                                            (va_block),                                                                \
                                            (nid),                                                                     \
                                            uvm_va_block_region_from_block((va_block)))

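// Recompute the block-wide CPU resident mask as the union of the per-NUMA-node
// resident masks.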
static void block_update_cpu_resident_mask(uvm_va_block_t *va_block)
{
    int nid;

    uvm_page_mask_zero(&va_block->cpu.resident);
    for_each_possible_uvm_node(nid) {
        uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
        uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, &node_state->resident);
    }
}

void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    uvm_va_block_cpu_node_state_t *node_state;

    node_state = block_node_state_get(va_block, nid);
    UVM_ASSERT(node_state);
    UVM_ASSERT(uvm_page_mask_test(&node_state->allocated, page_index));
    uvm_page_mask_set(&node_state->resident, page_index);
    uvm_page_mask_set(&va_block->cpu.resident, page_index);
    uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
}

// Set all CPU pages in the mask as resident on NUMA node nid.
// nid cannot be NUMA_NO_NODE.
static void uvm_va_block_cpu_set_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
{
    uvm_va_block_cpu_node_state_t *node_state;

    node_state = block_node_state_get(va_block, nid);
    UVM_ASSERT(node_state);
    UVM_ASSERT(uvm_page_mask_subset(mask, &node_state->allocated));
    uvm_page_mask_or(&node_state->resident, &node_state->resident, mask);
    uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, mask);
}

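// Mark the pages in page_mask as resident on the CPU, distributing them to the
// NUMA nodes recorded in make_resident.cpu_pages_used. Every page in page_mask
// must be covered by one of the tracked nodes.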
static void uvm_va_block_cpu_set_resident_all_chunks(uvm_va_block_t *va_block,
                                                     uvm_va_block_context_t *va_block_context,
                                                     const uvm_page_mask_t *page_mask)
{
    uvm_make_resident_page_tracking_t *tracking = &va_block_context->make_resident.cpu_pages_used;
    uvm_page_mask_t *node_pages_mask = &va_block_context->make_resident.node_pages_mask;
    uvm_page_mask_t *page_mask_copy = &va_block_context->scratch_page_mask;
    int nid;

    if (uvm_page_mask_empty(page_mask))
        return;

    uvm_page_mask_copy(page_mask_copy, page_mask);
    for_each_node_mask(nid, tracking->nodes) {
        uvm_page_mask_t *node_mask = block_tracking_node_mask_get(va_block_context, nid);

        if (uvm_page_mask_and(node_pages_mask, page_mask_copy, node_mask)) {
            uvm_va_block_cpu_set_resident_mask(va_block, nid, node_pages_mask);
            uvm_page_mask_andnot(page_mask_copy, page_mask_copy, node_pages_mask);
        }
    }

    UVM_ASSERT(uvm_page_mask_empty(page_mask_copy));
}

// Clear residency for all CPU pages in the mask.
// nid cannot be NUMA_NO_NODE.
static void uvm_va_block_cpu_clear_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
{
    uvm_va_block_cpu_node_state_t *node_state;

    node_state = block_node_state_get(va_block, nid);
    UVM_ASSERT(node_state);
    uvm_page_mask_andnot(&node_state->resident, &node_state->resident, mask);
    block_update_cpu_resident_mask(va_block);
}

static void uvm_va_block_cpu_clear_resident_region(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
{
    uvm_va_block_cpu_node_state_t *node_state;

    node_state = block_node_state_get(va_block, nid);
    UVM_ASSERT(node_state);
    uvm_page_mask_region_clear(&node_state->resident, region);
    block_update_cpu_resident_mask(va_block);
}

// Clear residency bits from any/all processors that might have had pages resident.
// Note that both the destination processor and any CPU NUMA nodes where pages are
// migrating to need to be skipped as the block logic sets the new page residency
// before clearing the old ones (see uvm_va_block_make_resident_finish()).
static void uvm_va_block_cpu_clear_resident_all_chunks(uvm_va_block_t *va_block,
                                                       uvm_va_block_context_t *va_block_context,
                                                       uvm_page_mask_t *page_mask)
{
    int nid;

    if (UVM_ID_IS_CPU(va_block_context->make_resident.dest_id) &&
        nodes_empty(va_block_context->make_resident.cpu_pages_used.nodes))
        return;

    for_each_possible_uvm_node(nid) {
        // If the destination is the CPU and pages were allocated on this node
        // for the migration, clear residency on the node only for pages that
        // are in the page_mask but not in the node's allocated mask.
        if (UVM_ID_IS_CPU(va_block_context->make_resident.dest_id) &&
            node_isset(nid, va_block_context->make_resident.cpu_pages_used.nodes)) {
            uvm_page_mask_t *node_pages_mask = &va_block_context->make_resident.node_pages_mask;
            uvm_page_mask_t *node_alloc_mask = block_tracking_node_mask_get(va_block_context, nid);
            uvm_page_mask_t *nid_resident = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
            uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;

            uvm_page_mask_andnot(node_pages_mask, nid_resident, node_alloc_mask);
            if (uvm_page_mask_and(node_pages_mask, migrated_pages, node_pages_mask))
                uvm_va_block_cpu_clear_resident_mask(va_block, nid, node_pages_mask);
        }
        else {
            uvm_va_block_cpu_clear_resident_mask(va_block, nid, page_mask);
        }
    }
}

bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
{
    uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);

    return uvm_page_mask_test(resident_mask, page_index);
}

bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
{
    uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);

    return uvm_page_mask_region_full(resident_mask, region);
}

// Return the preferred NUMA node ID for the block's policy.
// If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
// with memory is returned. In most cases, this should be the current
// NUMA node.
static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
{
    if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
        return va_block_context->make_resident.dest_nid;

    return numa_mem_id();
}

struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
                                                    struct mm_struct *mm,
                                                    NvU64 start,
                                                    uvm_va_block_region_t *region)
{
    struct vm_area_struct *vma;
    NvU64 end;

    if (start > va_block->end)
        return NULL;

    vma = find_vma_intersection(mm, start, va_block->end + 1);
    if (!vma)
        return NULL;

    if (start < vma->vm_start)
        start = vma->vm_start;

    end = vma->vm_end - 1;
    if (end > va_block->end)
        end = va_block->end;

    *region = uvm_va_block_region_from_start_end(va_block, start, end);

    return vma;
}

static bool block_check_cpu_chunks(uvm_va_block_t *block)
{
    int nid;
    uvm_page_mask_t *temp_resident_mask;

    temp_resident_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS | __GFP_ZERO);

    for_each_possible_uvm_node(nid) {
        uvm_cpu_chunk_t *chunk;
        uvm_page_index_t page_index;
        uvm_va_block_region_t prev_region = {0};
        uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
        size_t alloced_pages = 0;

        for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
            uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
            size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
            uvm_page_index_t chunk_page;

            UVM_ASSERT(prev_region.outer <= chunk_region.first);
            UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
            UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));

            alloced_pages += uvm_cpu_chunk_num_pages(chunk);
            UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, chunk_region));
            prev_region = chunk_region;

            for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
                UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, nid, chunk_page) == chunk);
        }

        UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&node_state->allocated));
        UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &node_state->allocated));
        UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &block->cpu.resident));
        if (temp_resident_mask && !uvm_page_mask_empty(&node_state->resident)) {
            UVM_ASSERT(!uvm_page_mask_intersects(&node_state->resident, temp_resident_mask));
            uvm_page_mask_or(temp_resident_mask, temp_resident_mask, &node_state->resident);
        }
    }

    if (temp_resident_mask) {
        UVM_ASSERT(uvm_page_mask_equal(temp_resident_mask, &block->cpu.resident));
        kmem_cache_free(g_uvm_page_mask_cache, temp_resident_mask);
    }

    return true;
}

// Frees any left-over free chunks and unpins all the used chunks
void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
{
    uvm_gpu_t *gpu;
    uvm_gpu_chunk_t *gpu_chunk;
    uvm_gpu_chunk_t *next_chunk;

    if (!retry)
        return;

    uvm_tracker_deinit(&retry->tracker);

    // Free any unused chunks
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
    }

    // Unpin all the used chunks now that we are done
    list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
        list_del_init(&gpu_chunk->list);
        gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
        // HMM should have already moved allocated blocks to the referenced
        // state so any left over were not migrated and should be freed.
        if (uvm_va_block_is_hmm(va_block))
            uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
        else
            uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
    }
}

static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->free_chunks);
}

static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
{
    list_add_tail(&gpu_chunk->list, &retry->used_chunks);
}

static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
{
    uvm_gpu_chunk_t *gpu_chunk;

    list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
        if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
            list_del_init(&gpu_chunk->list);
            return gpu_chunk;
        }
    }

    return NULL;
}

// Encapsulates a reference to a physical page belonging to a specific processor
// within a VA block.
typedef struct
{
    // Processor the page is on
    uvm_processor_id_t processor;

    // The page index
    uvm_page_index_t page_index;

    // If processor is the CPU, the NUMA node of the page.
    int nid;
} block_phys_page_t;

static block_phys_page_t block_phys_page(uvm_processor_id_t processor, int nid, uvm_page_index_t page_index)
{
    if (UVM_ID_IS_CPU(processor))
        UVM_ASSERT(nid != NUMA_NO_NODE);

    return (block_phys_page_t){ processor, page_index, nid };
}

NV_STATUS uvm_va_block_init(void)
{
    if (uvm_enable_builtin_tests)
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
    else
        g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);

    if (!g_uvm_va_block_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
    if (!g_uvm_va_block_gpu_state_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
    if (!g_uvm_page_mask_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
    if (!g_uvm_va_block_context_cache)
        return NV_ERR_NO_MEMORY;

    g_uvm_va_block_cpu_node_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_cpu_node_state_t",
                                                               uvm_va_block_cpu_node_state_t);
    if (!g_uvm_va_block_cpu_node_state_cache)
        return NV_ERR_NO_MEMORY;

    return NV_OK;
}

void uvm_va_block_exit(void)
{
    kmem_cache_destroy_safe(&g_uvm_va_block_cpu_node_state_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
    kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
    kmem_cache_destroy_safe(&g_uvm_va_block_cache);
}

static void block_context_free_tracking(uvm_make_resident_page_tracking_t *tracking)
{
    size_t index;

    for (index = 0; index < num_possible_nodes(); index++) {
        if (tracking->node_masks[index])
            kmem_cache_free(g_uvm_page_mask_cache, tracking->node_masks[index]);
    }

    uvm_kvfree(tracking->node_masks);
}

static NV_STATUS block_context_alloc_tracking(uvm_make_resident_page_tracking_t *tracking)
{
    size_t index;

    tracking->node_masks = uvm_kvmalloc_zero(num_possible_nodes() * sizeof(*tracking->node_masks));
    if (!tracking->node_masks)
        return NV_ERR_NO_MEMORY;

    for (index = 0; index < num_possible_nodes(); index++) {
        tracking->node_masks[index] = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
        if (!tracking->node_masks[index])
            goto error;
    }

    return NV_OK;

error:
    block_context_free_tracking(tracking);
    return NV_ERR_NO_MEMORY;
}

uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
{
    uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
    NV_STATUS status;

    if (!block_context)
        return NULL;

    status = block_context_alloc_tracking(&block_context->make_resident.cpu_pages_used);
    if (status != NV_OK) {
        kmem_cache_free(g_uvm_va_block_context_cache, block_context);
        return NULL;
    }

    uvm_va_block_context_init(block_context, mm);
    return block_context;
}

void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
{
    UVM_ASSERT(va_block_context);

    // Write garbage into the VA Block context to ensure that the UVM code
    // clears masks appropriately
    if (UVM_IS_DEBUG()) {
        uvm_page_mask_t **mask_array = va_block_context->make_resident.cpu_pages_used.node_masks;
        int nid;

        memset(va_block_context, 0xff, sizeof(*va_block_context));

        for_each_possible_uvm_node(nid)
            uvm_page_mask_fill(mask_array[node_to_index(nid)]);

        va_block_context->make_resident.cpu_pages_used.node_masks = mask_array;
    }

    va_block_context->mm = mm;
    va_block_context->make_resident.dest_nid = NUMA_NO_NODE;
    nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
}

void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
{
    if (va_block_context) {
        block_context_free_tracking(&va_block_context->make_resident.cpu_pages_used);
        kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
    }
}

// Convert from page_index to chunk_index. The goal is for each system page in
// the region [start, start + size) to be covered by the largest naturally-
// aligned user chunk size.
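//
// For example (a sketch assuming a 4K PAGE_SIZE and {4K, 64K, 2M} user chunk
// sizes), a region starting 8K past a 64K boundary with size 136K decomposes
// into fourteen leading 4K chunks, one 64K chunk, and four trailing 4K chunks.
// A page_index falling within the 64K chunk then maps to chunk index 14 with
// *out_chunk_size == UVM_CHUNK_SIZE_64K.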
uvm_va_block_gpu_chunk_index_range(NvU64 start,NvU64 size,uvm_gpu_t * gpu,uvm_page_index_t page_index,uvm_chunk_size_t * out_chunk_size)1016 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
1017                                           NvU64 size,
1018                                           uvm_gpu_t *gpu,
1019                                           uvm_page_index_t page_index,
1020                                           uvm_chunk_size_t *out_chunk_size)
1021 {
1022     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
1023     uvm_chunk_size_t chunk_size, final_chunk_size;
1024     size_t num_chunks, num_chunks_total;
1025     NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;
1026 
1027     UVM_ASSERT(PAGE_ALIGNED(start));
1028     UVM_ASSERT(PAGE_ALIGNED(size));
1029     UVM_ASSERT(size > 0);
1030     UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
1031     UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
1032     BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);
1033 
1034     // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
1035     // mask, since we never deal with chunk sizes smaller than that (although we
1036     // may have PTEs mapping pages smaller than that).
1037     UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);
1038 
1039     // Optimize the ideal Pascal+ case: the whole block is covered by a single
1040     // 2M page.
1041     if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
1042         UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
1043         final_chunk_size = UVM_CHUNK_SIZE_2M;
1044         num_chunks_total = 0;
1045         goto out;
1046     }
1047 
1048     // Only one 2M chunk can fit within a VA block on any GPU architecture, so
1049     // remove that size from consideration.
1050     chunk_sizes &= ~UVM_CHUNK_SIZE_2M;
1051 
1052     // Next common case: the whole block is aligned and sized to perfectly fit
1053     // the largest page size.
1054     final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
1055     if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
1056         num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
1057         goto out;
1058     }
1059 
1060     // We didn't hit our special paths. Do it the hard way.
1061 
1062     num_chunks_total = 0;
1063     addr = start + page_index * PAGE_SIZE;
1064     end = start + size;
1065     final_chunk_size = 0;
1066     UVM_ASSERT(addr < end);
1067 
1068     // The below loop collapses almost completely when chunk_size == PAGE_SIZE
1069     // since in that lowest-common-denominator case everything is already
1070     // aligned. Skip it and handle that specially after the loop.
1071     //
1072     // Note that since we removed 2M already above, this loop will only iterate
1073     // once on x86 Pascal+ since only 64K is left.
1074     chunk_sizes &= ~PAGE_SIZE;
1075 
1076     // This loop calculates the number of chunks between start and addr by
1077     // calculating the number of whole chunks of each size between them,
1078     // starting with the largest allowed chunk size. This requires fewer
1079     // iterations than if we began from start and kept calculating the next
1080     // larger chunk size boundary.
1081     for_each_chunk_size_rev(chunk_size, chunk_sizes) {
1082         aligned_start = UVM_ALIGN_UP(start, chunk_size);
1083         aligned_addr  = UVM_ALIGN_DOWN(addr, chunk_size);
1084         aligned_end   = UVM_ALIGN_DOWN(end, chunk_size);
1085 
1086         // If addr and start are within the same chunk, try smaller
1087         if (aligned_start > aligned_addr)
1088             continue;
1089 
1090         // If addr and end are not in the same chunk, then addr is covered by a
1091         // single chunk of the current size. Ignore smaller boundaries between
1092         // addr and aligned_addr.
1093         if (aligned_addr < aligned_end && final_chunk_size == 0) {
1094             addr = aligned_addr;
1095             final_chunk_size = chunk_size;
1096         }
1097 
1098         // How many chunks of this size are between start and addr? Note that
1099         // this might be 0 since aligned_addr and aligned_start could be in the
1100         // same chunk.
1101         num_chunks = uvm_div_pow2_32(((NvU32)aligned_addr - aligned_start), chunk_size);
1102         num_chunks_total += num_chunks;
1103 
1104         // We've already accounted for these chunks, so "remove" them by
1105         // bringing start, addr, and end closer together to calculate the
1106         // remaining chunk sizes.
1107         temp_size = num_chunks * chunk_size;
1108         addr -= temp_size;
1109         end -= temp_size;
1110 
1111         // Once there's no separation between addr and start, and we've
1112         // successfully found the right chunk size when taking end into account,
1113         // we're done.
1114         if (addr == start && final_chunk_size)
1115             break;
1116     }
1117 
1118     // Handle PAGE_SIZE cleanup since we skipped it in the loop
1119     num_chunks_total += (addr - start) / PAGE_SIZE;
1120     if (final_chunk_size == 0)
1121         final_chunk_size = PAGE_SIZE;
1122 
1123 out:
1124     if (out_chunk_size)
1125         *out_chunk_size = final_chunk_size;
1126 
1127     return num_chunks_total;
1128 }
1129 
block_gpu_chunk_index_range(uvm_va_block_t * va_block,NvU64 start,NvU64 size,uvm_gpu_t * gpu,uvm_page_index_t page_index,uvm_chunk_size_t * out_chunk_size)1130 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
1131                                           NvU64 start,
1132                                           NvU64 size,
1133                                           uvm_gpu_t *gpu,
1134                                           uvm_page_index_t page_index,
1135                                           uvm_chunk_size_t *out_chunk_size)
1136 {
1137     if (uvm_va_block_is_hmm(va_block)) {
1138         if (out_chunk_size)
1139             *out_chunk_size = PAGE_SIZE;
1140         return page_index;
1141     }
1142 
1143     return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
1144 }
1145 
block_gpu_chunk_index(uvm_va_block_t * block,uvm_gpu_t * gpu,uvm_page_index_t page_index,uvm_chunk_size_t * out_chunk_size)1146 static size_t block_gpu_chunk_index(uvm_va_block_t *block,
1147                                     uvm_gpu_t *gpu,
1148                                     uvm_page_index_t page_index,
1149                                     uvm_chunk_size_t *out_chunk_size)
1150 {
1151     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1152     uvm_chunk_size_t size;
1153     uvm_gpu_chunk_t *chunk;
1154     size_t index;
1155 
1156     index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);
1157 
1158     UVM_ASSERT(size >= PAGE_SIZE);
1159 
1160     if (gpu_state) {
1161         UVM_ASSERT(gpu_state->chunks);
1162         chunk = gpu_state->chunks[index];
1163         if (chunk) {
1164             UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
1165             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
1166             UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
1167         }
1168     }
1169 
1170     if (out_chunk_size)
1171         *out_chunk_size = size;
1172 
1173     return index;
1174 }
1175 
1176 // Compute the size of the chunk known to start at start_page_index
block_gpu_chunk_size(uvm_va_block_t * block,uvm_gpu_t * gpu,uvm_page_index_t start_page_index)1177 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
1178 {
1179     uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
1180     uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
1181     NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
1182     NvU64 size = block->end - start + 1;
1183 
1184     if (uvm_va_block_is_hmm(block))
1185         return PAGE_SIZE;
1186 
1187     // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
1188     // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
1189     // Example: 1011000 -> 0001111
1190     start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));
1191 
1192     // Next, compute all sizes (powers of two) which are <= size.
1193     pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
1194     pow2_leq_size |= pow2_leq_size - 1;
1195 
1196     // Now AND them all together to get our list of GPU-supported chunk sizes
1197     // which are aligned to start and will fit within size.
1198     allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;
1199 
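    // Illustrative walk-through (hypothetical values, assuming 4K pages and a
    // GPU chunk_sizes mask of 0x211000, i.e. {4K, 64K, 2M}): for
    // start = 0x170000 and size = 0x90000,
    //   start_alignments = 0x170000 ^ 0x16ffff = 0x1ffff (sizes up to 64K)
    //   pow2_leq_size    = 0x80000 | 0x7ffff = 0xfffff   (sizes up to 512K)
    //   allowed_sizes    = 0x211000 & 0x1ffff & 0xfffff = 0x11000 = {4K, 64K}
    // and the largest allowed size picked below is 64K.
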
1200     // start and size must always be aligned to at least the smallest supported
1201     // chunk size (PAGE_SIZE).
1202     UVM_ASSERT(allowed_sizes >= PAGE_SIZE);
1203 
1204     // Take the largest allowed size
1205     return uvm_chunk_find_last_size(allowed_sizes);
1206 }
1207 
1208 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
1209 {
1210     return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
1211 }
1212 
1213 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
1214 {
1215     uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
1216     return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
1217 }
1218 
1219 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
1220 {
1221     size_t chunk_index;
1222     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
1223     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1224 
1225     uvm_assert_mutex_locked(&va_block->lock);
1226 
1227     if (!gpu_state)
1228         return NULL;
1229 
1230     chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
1231 
1232     return gpu_state->chunks[chunk_index];
1233 }
1234 
1235 static void uvm_va_block_free(uvm_va_block_t *block)
1236 {
1237     if (uvm_enable_builtin_tests) {
1238         uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
1239 
1240         kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
1241     }
1242     else {
1243         kmem_cache_free(g_uvm_va_block_cache, block);
1244     }
1245 }
1246 
1247 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
1248                               NvU64 start,
1249                               NvU64 end,
1250                               uvm_va_block_t **out_block)
1251 {
1252     uvm_va_block_t *block = NULL;
1253     NvU64 size = end - start + 1;
1254     int nid;
1255 
1256     UVM_ASSERT(PAGE_ALIGNED(start));
1257     UVM_ASSERT(PAGE_ALIGNED(end + 1));
1258     UVM_ASSERT(PAGE_ALIGNED(size));
1259     UVM_ASSERT(size > 0);
1260     UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
1261 
1262     if (va_range) {
1263         // Create a managed va_block.
1264         UVM_ASSERT(start >= va_range->node.start);
1265         UVM_ASSERT(end <= va_range->node.end);
1266         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1267     }
1268 
1269     // Blocks can't span a block alignment boundary
1270     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
1271 
1272     if (uvm_enable_builtin_tests) {
1273         uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
1274 
1275         if (block_wrapper) {
1276             block = &block_wrapper->block;
1277             block_wrapper->test.cpu_chunk_allocation_target_id = NUMA_NO_NODE;
1278             block_wrapper->test.cpu_chunk_allocation_actual_id = NUMA_NO_NODE;
1279         }
1280     }
1281     else {
1282         block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
1283     }
1284 
1285     if (!block)
1286         return NV_ERR_NO_MEMORY;
1287 
1288     block->cpu.node_state = uvm_kvmalloc_zero(sizeof(*block->cpu.node_state) * num_possible_nodes());
1289     if (!block->cpu.node_state)
1290         goto error_block_free;
1291 
1292     for_each_possible_uvm_node(nid) {
1293         size_t index = node_to_index(nid);
1294 
1295         block->cpu.node_state[index] = nv_kmem_cache_zalloc(g_uvm_va_block_cpu_node_state_cache, NV_UVM_GFP_FLAGS);
1296         if (!block->cpu.node_state[index])
1297             goto error;
1298     }
1299 
1300     nv_kref_init(&block->kref);
1301     uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
1302     block->start = start;
1303     block->end = end;
1304     block->va_range = va_range;
1305     uvm_tracker_init(&block->tracker);
1306     block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;
1307 
1308     nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);
1309 
1310     *out_block = block;
1311     return NV_OK;
1312 
1313 error:
1314     if (block->cpu.node_state) {
1315         for_each_possible_uvm_node(nid) {
1316             size_t index = node_to_index(nid);
1317 
1318             if (block->cpu.node_state[index])
1319                 kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
1320         }
1321     }
1322 
1323     uvm_kvfree(block->cpu.node_state);
1324 
1325 error_block_free:
1326     uvm_va_block_free(block);
1327     return NV_ERR_NO_MEMORY;
1328 }
1329 
1330 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
1331 {
1332     NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
1333     if (gpu_mapping_addr == 0)
1334         return;
1335 
1336     uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
1337     uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
1338 }
1339 
1340 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
1341                                                   uvm_va_block_t *block,
1342                                                   uvm_page_index_t page_index,
1343                                                   uvm_gpu_t *gpu)
1344 {
1345     NV_STATUS status;
1346     uvm_chunk_size_t chunk_size;
1347 
1348     // When the Confidential Computing feature is enabled the transfers don't
1349     // use the DMA mapping of CPU chunks (since it's protected memory), but
1350     // the DMA address of the unprotected dma buffer.
1351     if (g_uvm_global.conf_computing_enabled)
1352         return NV_OK;
1353 
1354     status = uvm_cpu_chunk_map_gpu(chunk, gpu);
1355     if (status != NV_OK)
1356         return status;
1357 
1358     chunk_size = uvm_cpu_chunk_get_size(chunk);
1359 
1360     // TODO: Bug 3744779: Handle benign assertion in
1361     //       pmm_sysmem_mappings_remove_gpu_mapping() in case of a
1362     //       failure.
1363     status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
1364                                                      uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent),
1365                                                      uvm_va_block_cpu_page_address(block, page_index),
1366                                                      chunk_size,
1367                                                      block,
1368                                                      UVM_ID_CPU);
1369     if (status != NV_OK)
1370         cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1371 
1372     return status;
1373 }
1374 
1375 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1376 {
1377     uvm_cpu_chunk_t *chunk;
1378     uvm_page_index_t page_index;
1379     int nid;
1380 
1381     for_each_possible_uvm_node(nid) {
1382         for_each_cpu_chunk_in_block(chunk, page_index, block, nid)
1383             cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1384     }
1385 }
1386 
1387 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1388 {
1389     NV_STATUS status;
1390     uvm_cpu_chunk_t *chunk;
1391     NvU64 block_mapping_size = uvm_va_block_size(block);
1392     uvm_page_index_t page_index;
1393     int nid;
1394 
1395     UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
1396 
1397     for_each_possible_uvm_node(nid) {
1398         for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
1399             UVM_ASSERT_MSG(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0,
1400                            "GPU%u DMA address 0x%llx\n",
1401                            uvm_id_value(gpu->id),
1402                            uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent));
1403 
1404             status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
1405             if (status != NV_OK)
1406                 goto error;
1407         }
1408     }
1409 
1410     return NV_OK;
1411 
1412 error:
1413     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
1414     return status;
1415 }
1416 
1417 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
1418                                                      uvm_gpu_t *local_gpu,
1419                                                      uvm_gpu_chunk_t *chunk,
1420                                                      uvm_gpu_t *accessing_gpu)
1421 {
1422     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1423     return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
1424                                                          peer_addr,
1425                                                          block->start + chunk->va_block_page_index * PAGE_SIZE,
1426                                                          uvm_gpu_chunk_get_size(chunk),
1427                                                          block,
1428                                                          local_gpu->id);
1429 }
1430 
1431 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
1432                                                    uvm_gpu_chunk_t *chunk,
1433                                                    uvm_gpu_t *accessing_gpu)
1434 {
1435     NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1436     uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
1437 }
1438 
1439 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
1440                                                         uvm_gpu_t *local_gpu,
1441                                                         uvm_gpu_t *accessing_gpu)
1442 {
1443     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1444     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1445     size_t num_chunks, i;
1446     NV_STATUS status;
1447 
1448     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1449                                        accessing_gpu->id));
1450 
1451     // If no chunks are allocated currently, the mappings will be created later
1452     // at chunk allocation.
1453     if (!gpu_state || !gpu_state->chunks)
1454         return NV_OK;
1455 
1456     num_chunks = block_num_gpu_chunks(block, local_gpu);
1457     for (i = 0; i < num_chunks; i++) {
1458         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1459         if (!chunk)
1460             continue;
1461 
1462         status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
1463         if (status != NV_OK)
1464             goto error;
1465 
1466         status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
1467         if (status != NV_OK)
1468             goto error;
1469     }
1470 
1471     return NV_OK;
1472 
1473 error:
1474     while (i-- > 0) {
1475         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1476         if (chunk) {
1477             // Indirect peer mappings are removed lazily by PMM, so if an error
1478             // occurs the mappings established above will be removed when the
1479             // chunk is freed later on. We only need to remove the sysmem
1480             // reverse mappings.
1481             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1482         }
1483     }
1484 
1485     return status;
1486 }
1487 
1488 // Mappings for indirect peers are removed lazily by PMM, but we need to remove
1489 // the entries from the reverse map.
1490 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
1491                                                      uvm_gpu_t *local_gpu,
1492                                                      uvm_gpu_t *accessing_gpu)
1493 {
1494     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1495     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1496     size_t num_chunks, i;
1497 
1498     UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1499                                        accessing_gpu->id));
1500 
1501     // Exit if no chunks are allocated currently.
1502     if (!gpu_state || !gpu_state->chunks)
1503         return;
1504 
1505     num_chunks = block_num_gpu_chunks(block, local_gpu);
1506     for (i = 0; i < num_chunks; i++) {
1507         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1508         if (chunk)
1509             block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1510     }
1511 }
1512 
1513 // Retrieves the gpu_state for the given GPU. The returned pointer is
1514 // internally managed and will be allocated (and freed) automatically,
1515 // rather than by the caller.
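// Returns NULL if the state or its chunk array cannot be allocated, or if the
// physical CPU mappings for the new GPU cannot be established.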
1516 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
1517 {
1518     NV_STATUS status;
1519     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1520 
1521     if (gpu_state)
1522         return gpu_state;
1523 
1524     gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
1525     if (!gpu_state)
1526         return NULL;
1527 
1528     gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
1529     if (!gpu_state->chunks)
1530         goto error;
1531 
1532     block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;
1533 
1534     status = block_gpu_map_phys_all_cpu_pages(block, gpu);
1535     if (status != NV_OK)
1536         goto error;
1537 
1538     return gpu_state;
1539 
1540 error:
1541     uvm_kvfree(gpu_state->chunks);
1542     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
1543     block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;
1544 
1545     return NULL;
1546 }
1547 
1548 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
1549 {
1550     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1551     uvm_gpu_id_t gpu_id;
1552 
1553     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1554     uvm_assert_mutex_locked(&va_block->lock);
1555 
1556     for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
1557         if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
1558             return NV_ERR_NO_MEMORY;
1559     }
1560 
1561     return NV_OK;
1562 }
1563 
1564 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
1565                                           uvm_cpu_chunk_t *chunk,
1566                                           uvm_page_index_t page_index)
1567 {
1568     uvm_gpu_id_t id;
1569 
1570     for_each_gpu_id(id) {
1571         if (uvm_va_block_gpu_state_get(block, id))
1572             cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
1573     }
1574 }
1575 
1576 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
1577                                              uvm_cpu_chunk_t *chunk,
1578                                              uvm_page_index_t page_index)
1579 {
1580     NV_STATUS status;
1581     uvm_gpu_id_t id;
1582     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1583     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1584 
1585     // We can't iterate over va_space->registered_gpus because we might be
1586     // on the eviction path, which does not have the VA space lock held. We have
1587     // the VA block lock held however, so the gpu_states can't change.
1588     uvm_assert_mutex_locked(&block->lock);
1589 
1590     for_each_gpu_id(id) {
1591         uvm_gpu_t *gpu;
1592 
1593         if (!uvm_va_block_gpu_state_get(block, id))
1594             continue;
1595 
1596         gpu = block_get_gpu(block, id);
1597         status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
1598         if (status != NV_OK)
1599             goto error;
1600     }
1601 
1602     return NV_OK;
1603 
1604 error:
1605     uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
1606     return status;
1607 }
1608 
1609 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1610 {
1611     uvm_cpu_chunk_t *chunk;
1612     uvm_page_index_t page_index, next_page_index;
1613     uvm_va_block_region_t chunk_region;
1614     int nid;
1615 
1616     for_each_possible_uvm_node(nid) {
1617         for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, nid, region) {
1618             chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
1619 
1620             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
1621             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
1622             uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
1623             uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
1624             uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1625             uvm_cpu_chunk_free(chunk);
1626         }
1627     }
1628 
1629     if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1630         uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1631 
1632     if (uvm_page_mask_empty(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE)))
1633         uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
1634 }
1635 
1636 // Create physical mappings to allow other GPUs to access this chunk.
1637 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1638 {
1639     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1640     uvm_gpu_t *accessing_gpu, *remove_gpu;
1641     NV_STATUS status;
1642 
1643     // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
1644     // the eviction path, so we can assume that the VA space is locked.
1645     //
1646     // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
1647     //       meaning we may need to allocate peer memory and map it on the
1648     //       eviction path. That will require making sure that peers can't be
1649     //       enabled or disabled either in the VA space or globally within this
1650     //       function.
1651     uvm_assert_rwsem_locked(&va_space->lock);
1652     uvm_assert_mutex_locked(&block->lock);
1653 
1654     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1655         status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
1656         if (status != NV_OK)
1657             goto error;
1658 
1659         status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
1660         if (status != NV_OK)
1661             goto error;
1662     }
1663 
1664     return NV_OK;
1665 
1666 error:
1667     for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1668         if (remove_gpu == accessing_gpu)
1669             break;
1670 
1671         // Indirect peer mappings are removed lazily by PMM, so if an error
1672         // occurs the mappings established above will be removed when the
1673         // chunk is freed later on. We only need to remove the sysmem
1674         // reverse mappings.
1675         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
1676     }
1677 
1678     return status;
1679 }
1680 
1681 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1682 {
1683     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1684     uvm_gpu_t *peer_gpu;
1685 
1686     uvm_assert_rwsem_locked(&va_space->lock);
1687     uvm_assert_mutex_locked(&block->lock);
1688 
1689     // Indirect peer mappings are removed lazily by PMM, so we only need to
1690     // remove the sysmem reverse mappings.
1691     for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
1692         block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
1693 }
1694 
1695 // Mark a CPU page as dirty.
1696 static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1697 {
1698     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1699     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1700     uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
1701 }
1702 
1703 // Mark a CPU page as clean.
1704 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1705 {
1706     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1707     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1708     uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
1709 }
1710 
1711 // Check if a CPU page is dirty.
1712 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1713 {
1714     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1715     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1716     return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
1717 }
1718 
1719 static NV_STATUS block_alloc_cpu_chunk_inject_error(uvm_va_block_t *block,
1720                                                     uvm_chunk_size_t alloc_size,
1721                                                     uvm_cpu_chunk_alloc_flags_t flags,
1722                                                     int nid,
1723                                                     uvm_cpu_chunk_t **chunk)
1724 {
1725     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1726 
1727     if (block_test) {
1728         // Return an out-of-memory error if the tests have requested it. As opposed
1729         // to other error injection settings, this one fails N times and then
1730         // succeeds.
1731         // TODO: Bug 3701182: This will print a warning in Linux kernels newer
1732         // than 5.16.0-rc1+.
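        // Note that an injection count of ~(NvU32)0 is never decremented
        // below, so it behaves as "fail every allocation".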
1733         if (block_test->inject_cpu_pages_allocation_error_count) {
1734             if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
1735                 block_test->inject_cpu_pages_allocation_error_count--;
1736             return NV_ERR_NO_MEMORY;
1737         }
1738 
1739         if (block_test->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
1740             nid = block_test->cpu_chunk_allocation_actual_id;
1741     }
1742 
1743     return uvm_cpu_chunk_alloc(alloc_size, flags, nid, chunk);
1744 }
1745 
1746 // Allocate a CPU chunk with the given properties. This may involve retrying if
1747 // allocations fail. Allocating larger chunk sizes takes priority over
1748 // allocating on the specified node in the following manner:
1749 //
1750 // 1. Attempt to allocate the largest chunk on nid.
1751 // 2. If that fails attempt allocation of the largest chunk on any nid.
1752 // 3. If that fails attempt progressively smaller allocations on any nid.
1753 //
1754 // Returns NV_OK on success. Returns NV_WARN_MORE_PROCESSING_REQUIRED if
1755 // UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT was ignored to successfully allocate.
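//
// Illustrative order of attempts (hypothetical sizes, assuming supported CPU
// chunk sizes of {4K, 64K, 2M}, nid = 1 and UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT
// set): 2M strictly on node 1, then 2M on any node, then 64K and finally 4K
// with node 1 as a non-strict preference. If an allocation succeeds only after
// STRICT was dropped, NV_WARN_MORE_PROCESSING_REQUIRED is returned.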
1756 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
1757                                        uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1758                                        uvm_cpu_chunk_alloc_flags_t flags,
1759                                        int nid,
1760                                        uvm_cpu_chunk_t **chunk)
1761 {
1762     NV_STATUS status = NV_ERR_NO_MEMORY;
1763     uvm_chunk_size_t alloc_size;
1764     bool numa_fallback = false;
1765 
1766     for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1767         status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, nid, chunk);
1768         if (status == NV_OK)
1769             break;
1770 
1771         if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT) {
1772             flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
1773             numa_fallback = true;
1774             status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, NUMA_NO_NODE, chunk);
1775             if (status == NV_OK)
1776                 break;
1777         }
1778     }
1779 
1780     UVM_ASSERT(status == NV_OK || status == NV_ERR_NO_MEMORY);
1781 
1782     if (numa_fallback && status == NV_OK)
1783         status = NV_WARN_MORE_PROCESSING_REQUIRED;
1784 
1785     return status;
1786 }
1787 
1788 // Same as block_alloc_cpu_chunk(), but allocates a chunk suitable for use as
1789 // an HMM destination page. The main difference is that UVM does not own the
1790 // reference on the struct page backing these chunks.
1791 static NV_STATUS block_alloc_hmm_cpu_chunk(uvm_va_block_t *block,
1792                                            uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1793                                            uvm_cpu_chunk_alloc_flags_t flags,
1794                                            int nid,
1795                                            uvm_cpu_chunk_t **chunk)
1796 {
1797     NV_STATUS status;
1798 
1799     UVM_ASSERT(uvm_va_block_is_hmm(block));
1800 
1801     status = block_alloc_cpu_chunk(block, cpu_allocation_sizes, flags, nid, chunk);
1802     if (status == NV_OK)
1803         (*chunk)->type = UVM_CPU_CHUNK_TYPE_HMM;
1804 
1805     return status;
1806 }
1807 
1808 // Find the largest allocation size we can use for the given page_index in the
1809 // given block. Returns the mask of possible sizes and region covered by the
1810 // largest. Callers may also elect to use a smaller size.
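//
// Illustrative example (hypothetical layout, assuming 4K pages and allocation
// sizes {4K, 64K, 2M}): for page_index 17 in a full 2M block where only page 16
// is already allocated, the 2M and 64K candidates are rejected because their
// aligned regions ([0, 512) and [16, 32)) overlap page 16, so the returned mask
// is {4K} with *allocated_region = [17, 18).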
1811 static uvm_chunk_sizes_mask_t block_calculate_largest_alloc_size(uvm_va_block_t *va_block,
1812                                                                  uvm_page_index_t page_index,
1813                                                                  uvm_page_mask_t *allocated_mask,
1814                                                                  uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1815                                                                  uvm_va_block_region_t *allocated_region)
1816 {
1817     uvm_chunk_size_t alloc_size;
1818     uvm_chunk_sizes_mask_t allocation_sizes = cpu_allocation_sizes;
1819 
1820     for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1821         NvU64 alloc_virt_addr;
1822 
1823         // Page must be aligned to the allocation size.
1824         alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(va_block, page_index), alloc_size);
1825 
1826         // Allocation region must fit within the VA block.
1827         if (!uvm_va_block_contains_address(va_block, alloc_virt_addr) ||
1828             !uvm_va_block_contains_address(va_block, alloc_virt_addr + alloc_size - 1)) {
1829             allocation_sizes &= ~alloc_size;
1830             continue;
1831         }
1832 
1833         *allocated_region = uvm_va_block_region_from_start_end(va_block,
1834                                                                alloc_virt_addr,
1835                                                                alloc_virt_addr + alloc_size - 1);
1836 
1837         // Allocation region can't overlap previously allocated regions.
1838         if (!uvm_page_mask_region_empty(allocated_mask, *allocated_region)) {
1839             allocation_sizes &= ~alloc_size;
1840             continue;
1841         }
1842 
1843         return allocation_sizes;
1844     }
1845 
1846     // No possible size was found.
1847     allocated_region->first = 0;
1848     allocated_region->outer = 0;
1849 
1850     return UVM_CHUNK_SIZE_INVALID;
1851 }
1852 
1853 // Handle insertion of overlapping CPU chunks.
1854 // In cases where the kernel allocates CPU chunks on NUMA nodes that already
1855 // have existing chunks, it's possible that the newly allocated chunk overlaps
1856 // existing chunks.
1857 // In such cases, the newly allocated chunk has to be appropriately split and
1858 // only the non-overlapping subchunks inserted into the block.
1859 // The subchunks that are not inserted are freed.
1860 // If there is an error during split, insertion, or mapping, any sub-chunks that
1861 // have already been successfully inserted will remain in the block. The rest of
1862 // the sub-chunks will be freed in order to maintain proper refcounts on the
1863 // parent chunk.
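//
// Illustrative example (hypothetical layout, assuming 4K pages): a new 2M chunk
// whose region overlaps a single pre-existing 4K chunk at page 20 is first
// split into 64K sub-chunks; all of them except the one covering pages [16, 32)
// are inserted. That remaining sub-chunk is split again into 4K sub-chunks, of
// which the one at page 20 is freed and the rest are inserted.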
1864 static NV_STATUS block_populate_overlapping_cpu_chunks(uvm_va_block_t *block,
1865                                                        uvm_page_mask_t *node_pages_mask,
1866                                                        uvm_cpu_chunk_t *chunk,
1867                                                        uvm_page_index_t page_index)
1868 {
1869     int nid = uvm_cpu_chunk_get_numa_node(chunk);
1870     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
1871     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1872     uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1873     uvm_page_index_t running_page_index;
1874     uvm_cpu_chunk_t **split_chunks;
1875     uvm_cpu_chunk_t **small_chunks = NULL;
1876     uvm_cpu_chunk_t *chunk_ptr;
1877     uvm_chunk_size_t split_size;
1878     size_t i;
1879     NV_STATUS status;
1880 
1881     UVM_ASSERT(IS_ALIGNED(uvm_va_block_cpu_page_address(block, page_index), chunk_size));
1882 
1883     // Get a mask of all the chunk pages that are not overlapping existing
1884     // chunks.
1885     uvm_page_mask_init_from_region(node_pages_mask, chunk_region, NULL);
1886     uvm_page_mask_andnot(node_pages_mask, node_pages_mask, &node_state->allocated);
1887 
1888     split_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), chunk_size);
1889     split_chunks = uvm_kvmalloc_zero((chunk_size / split_size) * sizeof(*split_chunks));
1890     if (!split_chunks) {
1891         uvm_cpu_chunk_free(chunk);
1892         return NV_ERR_NO_MEMORY;
1893     }
1894 
1895     if (split_size > UVM_PAGE_SIZE_4K) {
1896         small_chunks = uvm_kvmalloc_zero(MAX_SMALL_CHUNKS_PER_BIG_SLOT * sizeof(*small_chunks));
1897         if (!small_chunks) {
1898             uvm_kvfree(split_chunks);
1899             uvm_cpu_chunk_free(chunk);
1900             return NV_ERR_NO_MEMORY;
1901         }
1902     }
1903 
1904     // If we are here, we have to do at least one split.
1905     // We can't call any of the block_split_cpu_chunk_to_* functions since they
1906     // insert all of the split chunks into the block.
1907     // We only want to insert the sub-chunks that don't overlap. So, we have to
1908     // handle that by calling uvm_cpu_chunk_split() directly.
1909     status = uvm_cpu_chunk_split(chunk, split_chunks);
1910     if (status != NV_OK)
1911         goto done;
1912 
1913     // Insert all split chunks that don't overlap existing allocations.
1914     // Note that this handles both splitting to 64K and 4K.
1915     running_page_index = page_index;
1916     for (i = 0; i < chunk_size / split_size; i++) {
1917         uvm_va_block_region_t subchunk_region = uvm_va_block_chunk_region(block, split_size, running_page_index);
1918 
1919         // - If all the pages covered by the split chunk are missing, insert the
1920         //   chunk into the block.
1921         // - If none of the pages are missing, free the chunk.
1922         // - Otherwise, some of the pages covered by the chunk are missing and a
1923         //   second split will be needed.
1924         if (uvm_page_mask_region_full(node_pages_mask, subchunk_region)) {
1925             status = uvm_cpu_chunk_insert_in_block(block, split_chunks[i], running_page_index);
1926             if (status != NV_OK)
1927                 goto done;
1928 
1929             // To prevent double chunk freeing on error, clear the array pointer
1930             // before mapping.
1931             chunk_ptr = split_chunks[i];
1932             split_chunks[i] = NULL;
1933             status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
1934             if (status != NV_OK)
1935                 goto done;
1936         }
1937         else if (uvm_page_mask_region_empty(node_pages_mask, subchunk_region)) {
1938             uvm_cpu_chunk_free(split_chunks[i]);
1939             split_chunks[i] = NULL;
1940         }
1941 
1942         running_page_index = subchunk_region.outer;
1943     }
1944 
1945     if (split_size > UVM_PAGE_SIZE_4K) {
1946         // Split any 64K chunks that overlap 4K chunks.
1947         for (i = 0; i < chunk_size / split_size; i++) {
1948             size_t j;
1949 
1950             if (!split_chunks[i])
1951                 continue;
1952 
1953             running_page_index = page_index + ((split_size * i) / PAGE_SIZE);
1954             status = uvm_cpu_chunk_split(split_chunks[i], small_chunks);
1955             if (status != NV_OK)
1956                 goto done;
1957 
1958             for (j = 0; j < MAX_SMALL_CHUNKS_PER_BIG_SLOT; j++) {
1959                 size_t chunk_num_pages = uvm_cpu_chunk_num_pages(small_chunks[j]);
1960 
1961                 if (uvm_page_mask_test(node_pages_mask, running_page_index)) {
1962                     status = uvm_cpu_chunk_insert_in_block(block, small_chunks[j], running_page_index);
1963                     if (status != NV_OK)
1964                         goto done;
1965 
1966                     // To prevent double chunk freeing on error, clear the array pointer
1967                     // before mapping.
1968                     chunk_ptr = small_chunks[j];
1969                     small_chunks[j] = NULL;
1970                     status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
1971                     if (status != NV_OK)
1972                         goto done;
1973                 }
1974                 else {
1975                     uvm_cpu_chunk_free(small_chunks[j]);
1976                 }
1977 
1978                 running_page_index += chunk_num_pages;
1979             }
1980         }
1981     }
1982 
1983 done:
1984     if (status != NV_OK) {
1985         // First, free any small chunks that have not been inserted.
1986         if (small_chunks) {
1987             for (i = 0; i < MAX_SMALL_CHUNKS_PER_BIG_SLOT; i++)
1988                 uvm_cpu_chunk_free(small_chunks[i]);
1989         }
1990 
1991         // Next, free any large chunks that have not been inserted.
1992         for (i = 0; i < chunk_size / split_size; i++)
1993             uvm_cpu_chunk_free(split_chunks[i]);
1994     }
1995 
1996     uvm_kvfree(small_chunks);
1997     uvm_kvfree(split_chunks);
1998     return status;
1999 }
2000 
2001 // Add the already allocated chunk to the block. Note that this
2002 // handles chunk management on failure, so the caller must not free
2003 // the chunk on failure.
2004 static NV_STATUS block_add_cpu_chunk(uvm_va_block_t *block,
2005                                      uvm_page_mask_t *node_pages_mask,
2006                                      uvm_cpu_chunk_t *chunk,
2007                                      uvm_va_block_region_t region)
2008 {
2009     NV_STATUS status = NV_OK;
2010     int alloced_nid;
2011     uvm_va_block_cpu_node_state_t *node_state;
2012     uvm_page_index_t page_index = region.first;
2013 
2014     alloced_nid = uvm_cpu_chunk_get_numa_node(chunk);
2015     node_state = block_node_state_get(block, alloced_nid);
2016     if (!uvm_page_mask_region_empty(&node_state->allocated, region)) {
2017         // We may have ended up falling back to allocating the chunk on a
2018         // non-preferred node which may already have had a chunk allocated on it,
2019         // in which case we can discard the new chunk.
2020         if (uvm_page_mask_region_full(&node_state->allocated, region)) {
2021             uvm_cpu_chunk_free(chunk);
2022         }
2023         else {
2024             // There is no need to free the chunk on failure since
2025             // block_populate_overlapping_cpu_chunks() would already have
2026             // done it.
2027             status = block_populate_overlapping_cpu_chunks(block, node_pages_mask, chunk, page_index);
2028         }
2029 
2030         return status;
2031     }
2032     else {
2033         status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
2034         if (status != NV_OK)
2035             goto out;
2036 
2037         status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk, page_index);
2038         if (status != NV_OK) {
2039             uvm_cpu_chunk_remove_from_block(block, uvm_cpu_chunk_get_numa_node(chunk), page_index);
2040             goto out;
2041         }
2042     }
2043 
2044 out:
2045     if (status != NV_OK) {
2046         // We free the chunk even though it was allocated by the caller because
2047         // block_populate_overlapping_cpu_chunks() can fail after freeing the
2048         // original chunk, so we need to do the same here.
2049         uvm_cpu_chunk_free(chunk);
2050     }
2051 
2052     return status;
2053 }
2054 
2055 // Allocates the input page in the block, if it doesn't already exist.
2056 //
2057 // Also maps the page for physical access by all GPUs used by the block, which
2058 // is required for IOMMU support. This is skipped on GPUs without access to CPU
2059 // memory, e.g., when the Confidential Computing feature is enabled.
2060 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
2061                                           uvm_page_mask_t *populate_page_mask,
2062                                           uvm_va_block_region_t populate_region,
2063                                           uvm_va_block_context_t *block_context,
2064                                           bool staged)
2065 {
2066     NV_STATUS status = NV_OK;
2067     uvm_cpu_chunk_t *chunk;
2068     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2069     uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
2070     uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
2071     uvm_page_mask_t *allocated_mask;
2072     uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
2073     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2074     const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
2075     uvm_page_index_t page_index;
2076     uvm_gpu_id_t id;
2077     int preferred_nid = block_context->make_resident.dest_nid;
2078 
2079     if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
2080         preferred_nid = block_test->cpu_chunk_allocation_target_id;
2081 
2082     // If the VA range has a preferred NUMA node, use it.
2083     if (preferred_nid == NUMA_NO_NODE)
2084         preferred_nid = policy->preferred_nid;
2085 
2086     // TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
2087     if (preferred_nid != NUMA_NO_NODE) {
2088         uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
2089         allocated_mask = &node_state->allocated;
2090         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
2091     }
2092     else {
2093         allocated_mask = &block->cpu.allocated;
2094     }
2095 
2096     if (va_space->test.allow_allocation_from_movable)
2097         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ALLOW_MOVABLE;
2098 
2099     // Check whether all requested pages have already been allocated.
2100     uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
2101     if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
2102                               &block_context->scratch_page_mask,
2103                               allocated_mask))
2104         return NV_OK;
2105 
2106     if (block_test) {
2107         if (block_test->cpu_chunk_allocation_size_mask)
2108             cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
2109     }
2110 
2111     uvm_page_mask_zero(resident_mask);
2112     for_each_id_in_mask(id, &block->resident)
2113         uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
2114 
2115     // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
2116     // should be used in order to avoid extra copies due to dirty compound
2117     // pages. HMM va_blocks also require PAGE_SIZE allocations.
2118     // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
2119     // migrations.
2120 
2121     if (!uvm_processor_mask_empty(&va_space->non_faultable_processors) || uvm_va_block_is_hmm(block))
2122         cpu_allocation_sizes = PAGE_SIZE;
2123 
2124     if (block_context->mm && !uvm_va_block_is_hmm(block))
2125         alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;
2126 
2127     UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
2128     UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);
2129 
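    // For each page that still lacks a CPU chunk, pick the largest feasible
    // allocation size (see block_calculate_largest_alloc_size()), allocate a
    // chunk with block_alloc_cpu_chunk() or block_alloc_hmm_cpu_chunk(), and
    // hand it to block_add_cpu_chunk(), which also creates the GPU physical
    // mappings.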
2130     for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
2131         uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags = alloc_flags;
2132         uvm_va_block_region_t region = populate_region;
2133         uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
2134         uvm_chunk_sizes_mask_t allocation_sizes;
2135 
2136         if (uvm_page_mask_test(allocated_mask, page_index) ||
2137             uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
2138             page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
2139             continue;
2140         }
2141 
2142         allocation_sizes = block_calculate_largest_alloc_size(block,
2143                                                               page_index,
2144                                                               allocated_mask,
2145                                                               cpu_allocation_sizes,
2146                                                               &region);
2147         if (allocation_sizes == UVM_CHUNK_SIZE_INVALID)
2148             return NV_ERR_NO_MEMORY;
2149 
2150         // If not all pages in the allocation region are resident somewhere,
2151         // zero out the allocated page.
2152         // This could be wasteful if only a few pages in a high-order
2153         // allocation need to be zeroed out, but the alternative is to map
2154         // single sub-pages one-by-one.
2155         if (!uvm_page_mask_region_full(resident_mask, region))
2156             chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
2157 
2158         // Management of a page used for a staged migration is never handed off
2159         // to the kernel; it is really just a driver-managed page. Therefore,
2160         // don't allocate an HMM chunk in this case.
2161         if (uvm_va_block_is_hmm(block) && !staged)
2162             status = block_alloc_hmm_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
2163         else
2164             status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
2165 
2166         if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2167             alloc_flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
2168             preferred_nid = NUMA_NO_NODE;
2169             block_context->make_resident.dest_nid = NUMA_NO_NODE;
2170         }
2171         else if (status != NV_OK) {
2172             return status;
2173         }
2174 
2175         // A smaller chunk than the maximum size may have been allocated; update the region accordingly.
2176         region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
2177         status = block_add_cpu_chunk(block, node_pages_mask, chunk, region);
2178         if (status != NV_OK)
2179             return status;
2180 
2181         // Skip iterating over all pages covered by the allocated chunk.
2182         page_index = region.outer - 1;
2183 
2184 #if UVM_IS_CONFIG_HMM()
2185         if (uvm_va_block_is_hmm(block) && block_context)
2186             block_context->hmm.dst_pfns[page_index] = migrate_pfn(page_to_pfn(chunk->page));
2187 #endif
2188     }
2189 
2190     return NV_OK;
2191 }
2192 
2193 // Note this clears the block_context caller_page_mask.
2194 NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_block_context_t *block_context)
2195 {
2196     uvm_page_mask_t *page_mask = &block_context->caller_page_mask;
2197 
2198     uvm_page_mask_zero(page_mask);
2199     uvm_page_mask_set(page_mask, page_index);
2200     return block_populate_pages_cpu(va_block, page_mask, uvm_va_block_region_from_block(va_block), block_context, false);
2201 }
2202 
2203 // Try allocating a chunk. If eviction was required,
2204 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
2205 // unlocked and relocked. The caller is responsible for adding the chunk to the
2206 // retry used_chunks list.
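// A minimal caller-side sketch (the operation name below is a placeholder, not
// a function in this file):
//
//     do {
//         status = operation_that_calls_block_alloc_gpu_chunk(block, &retry, ...);
//     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);
//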
2207 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
2208                                        uvm_va_block_retry_t *retry,
2209                                        uvm_gpu_t *gpu,
2210                                        uvm_chunk_size_t size,
2211                                        uvm_gpu_chunk_t **out_gpu_chunk)
2212 {
2213     NV_STATUS status = NV_OK;
2214     uvm_gpu_chunk_t *gpu_chunk;
2215 
2216     // First try getting a free chunk from previously-made allocations.
2217     gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
2218     if (!gpu_chunk) {
2219         uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2220         if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
2221             // Force eviction by pretending the allocation failed with no memory
2222             --block_test->user_pages_allocation_retry_force_count;
2223             status = NV_ERR_NO_MEMORY;
2224         }
2225         else {
2226             // Try allocating a new one without eviction
2227             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
2228         }
2229 
2230         if (status == NV_ERR_NO_MEMORY) {
2231             // If that fails with no memory, try allocating with eviction and
2232             // return back to the caller immediately so that the operation can
2233             // be restarted.
2234             uvm_mutex_unlock(&block->lock);
2235 
2236             status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
2237             if (status == NV_OK) {
2238                 block_retry_add_free_chunk(retry, gpu_chunk);
2239                 status = NV_ERR_MORE_PROCESSING_REQUIRED;
2240             }
2241 
2242             uvm_mutex_lock(&block->lock);
2243             return status;
2244         }
2245         else if (status != NV_OK) {
2246             return status;
2247         }
2248     }
2249 
2250     *out_gpu_chunk = gpu_chunk;
2251     return NV_OK;
2252 }
2253 
2254 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
2255 {
2256     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2257 
2258     if (!gpu_state)
2259         return false;
2260 
2261     return gpu_state->page_table_range_4k.table  ||
2262            gpu_state->page_table_range_big.table ||
2263            gpu_state->page_table_range_2m.table;
2264 }
2265 
2266 // A helper to get a known-to-be-present GPU VA space given a VA block that's
2267 // locked. In order to use this function, the caller must know that at least one
2268 // of these conditions is true:
2269 //
2270 // 1) The VA space lock is held
2271 // 2) The VA block has active page tables for the GPU
2272 //
2273 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go
2274 // away.
2275 //
2276 // On the eviction path, we don't have a lock on the VA space state. However,
2277 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page
2278 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU
2279 // has page tables (#2), the gpu_va_space can't go away while we're holding the
2280 // block lock.
2281 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2282 {
2283     uvm_gpu_va_space_t *gpu_va_space;
2284     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2285 
2286     UVM_ASSERT(gpu);
2287 
2288     if (!block_gpu_has_page_tables(va_block, gpu))
2289         uvm_assert_rwsem_locked(&va_space->lock);
2290 
2291     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));
2292 
2293     gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
2294 
2295     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
2296     UVM_ASSERT(gpu_va_space->va_space == va_space);
2297     UVM_ASSERT(gpu_va_space->gpu == gpu);
2298 
2299     return gpu_va_space;
2300 }
2301 
2302 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
2303 {
2304     uvm_gpu_va_space_t *gpu_va_space;
2305 
2306     // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
2307     if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
2308         return false;
2309 
2310     UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);
2311 
2312     gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
2313     return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
2314 }
2315 
2316 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2317 {
2318     uvm_gpu_va_space_t *gpu_va_space;
2319 
2320     gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
2321     return gpu_va_space->page_tables.big_page_size;
2322 }
2323 
2324 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
2325 {
2326     NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
2327     NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
2328 
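    // Illustrative example (hypothetical addresses, assuming 4K pages): with
    // big_page_size = 64K, start = 0x201000 and end = 0x23ffff give
    // first_addr = 0x210000 and outer_addr = 0x240000, i.e. the page region
    // [15, 63) covering three big pages.
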
2329     // The range must fit within a VA block
2330     UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
2331 
2332     if (outer_addr <= first_addr)
2333         return uvm_va_block_region(0, 0);
2334 
2335     return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
2336 }
2337 
2338 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
2339 {
2340     uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
2341     return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
2342 }
2343 
2344 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
2345 {
2346     return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
2347 }
2348 
2349 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
2350                                                           uvm_va_block_region_t region,
2351                                                           NvU32 big_page_size)
2352 {
2353     NvU64 start = uvm_va_block_region_start(va_block, region);
2354     NvU64 end = uvm_va_block_region_end(va_block, region);
2355     uvm_va_block_region_t big_region;
2356 
2357     UVM_ASSERT(start < va_block->end);
2358     UVM_ASSERT(end <= va_block->end);
2359 
2360     big_region = range_big_page_region_all(start, end, big_page_size);
2361     if (big_region.outer) {
2362         big_region.first += region.first;
2363         big_region.outer += region.first;
2364     }
2365 
2366     return big_region;
2367 }
2368 
2369 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
2370 {
2371     return range_num_big_pages(va_block->start, va_block->end, big_page_size);
2372 }
2373 
2374 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
2375 {
2376     NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
2377     UVM_ASSERT(addr >= va_block->start);
2378     UVM_ASSERT(addr < va_block->end);
2379     return addr;
2380 }
2381 
2382 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
2383 {
2384     NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
2385 
2386     // Assume that we don't have to handle multiple big PTEs per system page.
2387     // It's not terribly difficult to implement, but we don't currently have a
2388     // use case.
2389     UVM_ASSERT(big_page_size >= PAGE_SIZE);
2390 
2391     return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
2392 }
2393 
2394 // Returns the big page index (the bit index within
2395 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
2396 // page_index cannot be covered by a big PTE due to alignment or block size,
2397 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
2398 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
2399 {
2400     uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
2401     size_t big_index;
2402 
2403     // Note that this condition also handles the case of having no big pages in
2404     // the block, in which case .first >= .outer.
2405     if (page_index < big_region_all.first || page_index >= big_region_all.outer)
2406         return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;
2407 
2408     big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);
2409 
2410     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
2411     UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);
2412 
2413     return big_index;
2414 }
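
// Worked example for uvm_va_block_big_page_index() above (hypothetical
// values, assuming a 4K PAGE_SIZE): for a 2M-aligned, 2M-sized block and a
// 64K big page size, big_region_all covers page indexes [0, 512), so
// page_index 35 yields big_index = (35 * 4K) / 64K = 2, i.e. the big page
// spanning page indexes [32, 48). On a block whose start is not big-page
// aligned, page indexes below big_region_all.first or at/after
// big_region_all.outer return MAX_BIG_PAGES_PER_UVM_VA_BLOCK instead.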
2415 
2416 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
2417                                              uvm_gpu_t *gpu,
2418                                              uvm_page_mask_t *mask_out,
2419                                              const unsigned long *big_ptes_in)
2420 {
2421     uvm_va_block_region_t big_region;
2422     size_t big_page_index;
2423     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
2424 
2425     uvm_page_mask_zero(mask_out);
2426 
2427     for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
2428         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
2429         uvm_page_mask_region_fill(mask_out, big_region);
2430     }
2431 }
2432 
2433 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
2434 {
2435     if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
2436         return 0;
2437 
2438     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU));
2439 
2440     // Despite the fact that physical CPU memory can be allocated at sizes
2441     // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory
2442     // with 4K PTEs. Until the core kernel adds support for PMD mappings,
2443     // the return value of this function will remain at PAGE_SIZE.
2444     return PAGE_SIZE;
2445 }
2446 
2447 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
2448 {
2449     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2450     size_t big_page_size, big_page_index;
2451 
2452     if (!gpu_state)
2453         return 0;
2454 
2455     if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
2456         return 0;
2457 
2458     UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id));
2459 
2460     if (gpu_state->pte_is_2m)
2461         return UVM_PAGE_SIZE_2M;
2462 
2463     big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id));
2464     big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size);
2465     if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes))
2466         return big_page_size;
2467 
2468     return UVM_PAGE_SIZE_4K;
2469 }
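
// Minimal usage sketch for uvm_va_block_page_size_gpu() (illustrative only,
// not tied to any particular caller):
//
//   NvU32 pte_size = uvm_va_block_page_size_gpu(va_block, gpu->id, page_index);
//   if (pte_size == 0) {
//       // The page is not mapped on this GPU
//   }
//   else {
//       // The page is mapped with a 2M, big, or 4K PTE of size pte_size
//   }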
2470 
2471 // Get the size of the physical allocation backing the page, or 0 if not
2472 // resident. Note that this is different from uvm_va_block_page_size_* because
2473 // those return the size of the PTE which maps the page index, which may be
2474 // smaller than the physical allocation.
2475 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
2476 {
2477     uvm_va_block_gpu_state_t *gpu_state;
2478     uvm_chunk_size_t chunk_size;
2479 
2480     if (UVM_ID_IS_CPU(page.processor)) {
2481         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.nid, page.page_index);
2482         uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, page.processor, NUMA_NO_NODE);
2483 
2484         if (!uvm_page_mask_test(resident_mask, page.page_index))
2485             return 0;
2486 
2487         UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
2488         return (NvU32)uvm_cpu_chunk_get_size(chunk);
2489     }
2490 
2491     gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
2492     if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index))
2493         return 0;
2494 
2495     UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
2496     block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
2497     return (NvU32)chunk_size;
2498 }
2499 
2500 NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
2501                                      uvm_processor_id_t processor,
2502                                      uvm_page_index_t page_index)
2503 {
2504     int nid = NUMA_NO_NODE;
2505     block_phys_page_t page;
2506 
2507     UVM_ASSERT(block);
2508 
2509     uvm_assert_mutex_locked(&block->lock);
2510 
2511     if (UVM_ID_IS_CPU(processor)) {
2512         nid = block_get_page_node_residency(block, page_index);
2513         if (nid == NUMA_NO_NODE)
2514             return 0;
2515     }
2516 
2517     page = block_phys_page(processor, nid, page_index);
2518 
2519     return block_phys_page_size(block, page);
2520 }
2521 
2522 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
2523 {
2524     uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
2525 
2526     // ATOMIC and WRITE are synonyms for the CPU
2527     if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE)
2528         pte_bit_index = UVM_PTE_BITS_CPU_WRITE;
2529     else if (prot == UVM_PROT_READ_ONLY)
2530         pte_bit_index = UVM_PTE_BITS_CPU_READ;
2531     else
2532         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
2533 
2534     return pte_bit_index;
2535 }
2536 
2537 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
2538 {
2539     uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX;
2540 
2541     if (prot == UVM_PROT_READ_WRITE_ATOMIC)
2542         pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC;
2543     else if (prot == UVM_PROT_READ_WRITE)
2544         pte_bit_index = UVM_PTE_BITS_GPU_WRITE;
2545     else if (prot == UVM_PROT_READ_ONLY)
2546         pte_bit_index = UVM_PTE_BITS_GPU_READ;
2547     else
2548         UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
2549 
2550     return pte_bit_index;
2551 }
2552 
2553 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
2554 {
2555     uvm_va_block_gpu_state_t *gpu_state;
2556     uvm_page_mask_t *resident_mask;
2557 
2558     if (UVM_ID_IS_CPU(processor)) {
2559         uvm_va_block_cpu_node_state_t *node_state;
2560 
2561         if (nid == NUMA_NO_NODE) {
2562             resident_mask = &block->cpu.resident;
2563         }
2564         else {
2565             node_state = block_node_state_get(block, nid);
2566             resident_mask = &node_state->resident;
2567         }
2568     }
2569     else {
2570         gpu_state = uvm_va_block_gpu_state_get(block, processor);
2571         UVM_ASSERT(gpu_state);
2572         resident_mask = &gpu_state->resident;
2573     }
2574 
2575     return resident_mask;
2576 }
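
// Usage sketch for uvm_va_block_resident_mask_get() (illustrative only):
// passing NUMA_NO_NODE returns the aggregate CPU residency mask across all
// NUMA nodes, which is the common query:
//
//   uvm_page_mask_t *mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
//   bool resident_on_cpu = uvm_page_mask_test(mask, page_index);
//
// Passing a specific nid instead restricts the mask to pages resident on that
// node.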
2577 
2578 // Get the page residency mask for a processor
2579 //
2580 // Notably this will allocate GPU state if not yet present; if that allocation
2581 // fails, NULL is returned.
2582 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
2583 {
2584     uvm_va_block_gpu_state_t *gpu_state;
2585 
2586     if (UVM_ID_IS_CPU(processor))
2587         return uvm_va_block_resident_mask_get(block, processor, nid);
2588 
2589     gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
2590     if (!gpu_state)
2591         return NULL;
2592 
2593     return &gpu_state->resident;
2594 }
2595 
2596 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block,
2597                                                            uvm_processor_id_t processor,
2598                                                            uvm_prot_t prot)
2599 {
2600     uvm_va_block_gpu_state_t *gpu_state;
2601 
2602     if (UVM_ID_IS_CPU(processor))
2603         return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)];
2604 
2605     gpu_state = uvm_va_block_gpu_state_get(block, processor);
2606 
2607     UVM_ASSERT(gpu_state);
2608     return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)];
2609 }
2610 
2611 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
2612 {
2613     return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
2614 }
2615 
2616 void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
2617                                      uvm_va_block_region_t region,
2618                                      uvm_page_mask_t *out_mask)
2619 {
2620     uvm_processor_id_t id;
2621 
2622     uvm_assert_mutex_locked(&va_block->lock);
2623 
2624     if (!uvm_va_block_is_hmm(va_block)) {
2625         uvm_page_mask_complement(out_mask, &va_block->maybe_mapped_pages);
2626         return;
2627     }
2628 
2629     uvm_page_mask_region_fill(out_mask, region);
2630 
2631     for_each_id_in_mask(id, &va_block->mapped) {
2632         uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_map_mask_get(va_block, id));
2633     }
2634 }
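
// Worked example for the HMM path of uvm_va_block_unmapped_pages_get() above
// (hypothetical masks): with region [0, 8), the CPU mapping pages {0, 1, 2, 3}
// and one GPU mapping pages {2, 3, 4, 5}, out_mask starts as {0..7} and ends
// up as {6, 7} after subtracting each processor's map mask. The non-HMM path
// instead returns the complement of maybe_mapped_pages for the whole block.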
2635 
2636 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
2637 {
2638     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
2639     UVM_ASSERT(gpu_state);
2640 
2641     return &gpu_state->evicted;
2642 }
2643 
2644 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index)
2645 {
2646     uvm_processor_id_t id;
2647     for_each_id_in_mask(id, &block->resident) {
2648         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE), page_index))
2649             return true;
2650     }
2651 
2652     return false;
2653 }
2654 
2655 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
2656 {
2657     uvm_va_block_gpu_state_t *gpu_state;
2658     size_t chunk_index;
2659 
2660     if (UVM_ID_IS_CPU(proc))
2661         return uvm_page_mask_test(&block->cpu.allocated, page_index);
2662 
2663     gpu_state = uvm_va_block_gpu_state_get(block, proc);
2664     if (!gpu_state)
2665         return false;
2666 
2667     chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL);
2668     return gpu_state->chunks[chunk_index] != NULL;
2669 }
2670 
2671 // Compute the gpus that have at least the given access permissions for the
2672 // range described by region. The function sets a gpu's bit if any page in the
2673 // region has the permissions.
2674 static void block_region_authorized_gpus(uvm_va_block_t *va_block,
2675                                          uvm_va_block_region_t region,
2676                                          uvm_prot_t access_permission,
2677                                          uvm_processor_mask_t *authorized_gpus)
2678 {
2679     uvm_gpu_id_t gpu_id;
2680     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission);
2681 
2682     uvm_processor_mask_zero(authorized_gpus);
2683 
2684     // Test all GPUs with mappings on the block
2685     for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) {
2686         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2687         if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region))
2688             uvm_processor_mask_set(authorized_gpus, gpu_id);
2689     }
2690 }
2691 
2692 // Compute the processors that have at least the given access permissions for
2693 // the range described by region. The function sets a processor's bit if any
2694 // page in the region has the permissions.
2695 static void block_region_authorized_processors(uvm_va_block_t *va_block,
2696                                                uvm_va_block_region_t region,
2697                                                uvm_prot_t access_permission,
2698                                                uvm_processor_mask_t *authorized_processors)
2699 {
2700     uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission);
2701 
2702     // Compute GPUs
2703     block_region_authorized_gpus(va_block, region, access_permission, authorized_processors);
2704 
2705     // Test CPU
2706     if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) &&
2707         !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) {
2708         uvm_processor_mask_set(authorized_processors, UVM_ID_CPU);
2709     }
2710 }
2711 
2712 static void block_page_authorized_processors(uvm_va_block_t *va_block,
2713                                              uvm_page_index_t page_index,
2714                                              uvm_prot_t access_permission,
2715                                              uvm_processor_mask_t *authorized_processors)
2716 {
2717     block_region_authorized_processors(va_block,
2718                                        uvm_va_block_region_for_page(page_index),
2719                                        access_permission,
2720                                        authorized_processors);
2721 }
2722 
2723 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
2724                                                     uvm_va_block_region_t region,
2725                                                     uvm_gpu_id_t gpu_id,
2726                                                     uvm_prot_t required_prot)
2727 {
2728     uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot);
2729     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2730 
2731     if (!gpu_state)
2732         return false;
2733 
2734     return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region);
2735 }
2736 
2737 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
2738                                                           uvm_va_block_region_t region,
2739                                                           uvm_processor_id_t processor_id,
2740                                                           uvm_prot_t required_prot)
2741 {
2742     if (UVM_ID_IS_CPU(processor_id)) {
2743         uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot);
2744 
2745         return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region);
2746     }
2747     else {
2748         return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot);
2749     }
2750 }
2751 
2752 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2753                                          uvm_page_index_t page_index,
2754                                          uvm_gpu_id_t gpu_id,
2755                                          uvm_prot_t required_prot)
2756 {
2757     return block_is_gpu_authorized_on_whole_region(va_block,
2758                                                    uvm_va_block_region_for_page(page_index),
2759                                                    gpu_id,
2760                                                    required_prot);
2761 }
2762 
2763 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block,
2764                                                uvm_page_index_t page_index,
2765                                                uvm_processor_id_t processor_id,
2766                                                uvm_prot_t required_prot)
2767 {
2768     return block_is_processor_authorized_on_whole_region(va_block,
2769                                                          uvm_va_block_region_for_page(page_index),
2770                                                          processor_id,
2771                                                          required_prot);
2772 }
2773 
2774 // Compute the gpus that have a copy of the given page resident in their memory
2775 static void block_page_resident_gpus(uvm_va_block_t *va_block,
2776                                      uvm_page_index_t page_index,
2777                                      uvm_processor_mask_t *resident_gpus)
2778 {
2779     uvm_gpu_id_t id;
2780     uvm_processor_mask_zero(resident_gpus);
2781 
2782     for_each_gpu_id_in_mask(id, &va_block->resident) {
2783         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index)) {
2784             UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
2785             uvm_processor_mask_set(resident_gpus, id);
2786         }
2787     }
2788 }
2789 
2790 // Compute the processors that have a copy of the given page resident in their
2791 // memory.
2792 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2793                                            uvm_page_index_t page_index,
2794                                            uvm_processor_mask_t *resident_processors)
2795 {
2796     block_page_resident_gpus(va_block, page_index, resident_processors);
2797 
2798     if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE), page_index)) {
2799         UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
2800         uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
2801     }
2802 }
2803 
2804 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block,
2805                                                   uvm_va_block_context_t *va_block_context,
2806                                                   uvm_page_index_t page_index)
2807 {
2808     uvm_processor_mask_t *resident_processors = &va_block_context->scratch_processor_mask;
2809     uvm_va_block_page_resident_processors(va_block, page_index, resident_processors);
2810 
2811     return uvm_processor_mask_get_count(resident_processors);
2812 }
2813 
2814 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2815                                                           uvm_va_block_context_t *va_block_context,
2816                                                           uvm_page_index_t page_index,
2817                                                           uvm_processor_id_t processor)
2818 {
2819     uvm_processor_id_t id;
2820     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2821 
2822     uvm_processor_mask_copy(&va_block_context->scratch_processor_mask, &va_block->resident);
2823 
2824     for_each_closest_id(id, &va_block_context->scratch_processor_mask, processor, va_space) {
2825         if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index))
2826             return id;
2827     }
2828 
2829     return UVM_ID_INVALID;
2830 }
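
// For example, if the page is resident on the queried processor itself as
// well as on other processors, uvm_va_block_page_get_closest_resident()
// returns the queried processor, since for_each_closest_id() visits the mask
// in increasing distance from 'processor' and a processor is always closest
// to itself. If the page is not resident anywhere, UVM_ID_INVALID is
// returned.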
2831 
2832 // We don't track the specific aperture of each mapped page. Instead, we assume
2833 // that each virtual mapping from a given processor always targets the closest
2834 // processor on which that page is resident (with special rules for UVM-Lite).
2835 //
2836 // This function verifies that assumption: before a page becomes resident on a
2837 // new location, assert that no processor has a valid mapping to a farther
2838 // processor on that page.
2839 static bool block_check_resident_proximity(uvm_va_block_t *block,
2840                                            uvm_va_block_context_t *block_context,
2841                                            uvm_page_index_t page_index,
2842                                            uvm_processor_id_t new_residency)
2843 {
2844     uvm_processor_id_t mapped_id, closest_id;
2845     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2846     uvm_processor_mask_t *resident_procs = &block_context->scratch_processor_mask;
2847     const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
2848 
2849     for_each_id_in_mask(mapped_id, &block->mapped) {
2850         if (uvm_processor_mask_test(uvm_lite_gpus, mapped_id))
2851             continue;
2852 
2853         if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index))
2854             continue;
2855 
2856         uvm_va_block_page_resident_processors(block, page_index, resident_procs);
2857         UVM_ASSERT(!uvm_processor_mask_empty(resident_procs));
2858         UVM_ASSERT(!uvm_processor_mask_test(resident_procs, new_residency));
2859         uvm_processor_mask_set(resident_procs, new_residency);
2860         closest_id = uvm_processor_mask_find_closest_id(va_space, resident_procs, mapped_id);
2861         UVM_ASSERT(!uvm_id_equal(closest_id, new_residency));
2862     }
2863 
2864     return true;
2865 }
2866 
2867 // Returns the processor to which page_index should be mapped on gpu
2868 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block,
2869                                                          uvm_va_block_context_t *block_context,
2870                                                          uvm_gpu_t *gpu,
2871                                                          uvm_page_index_t page_index)
2872 {
2873     uvm_processor_id_t dest_id;
2874 
2875     // UVM-Lite GPUs can only map pages on the preferred location
2876     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id))
2877         return uvm_va_range_get_policy(block->va_range)->preferred_location;
2878 
2879     // Otherwise we always map the closest resident processor
2880     dest_id = uvm_va_block_page_get_closest_resident(block, block_context, page_index, gpu->id);
2881     UVM_ASSERT(UVM_ID_IS_VALID(dest_id));
2882     return dest_id;
2883 }
2884 
2885 // Returns the processor to which page_index should be mapped on mapping_id
2886 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block,
2887                                                      uvm_va_block_context_t *block_context,
2888                                                      uvm_processor_id_t mapping_id,
2889                                                      uvm_page_index_t page_index)
2890 {
2891 
2892     if (UVM_ID_IS_CPU(mapping_id))
2893         return uvm_va_block_page_get_closest_resident(block, block_context, page_index, mapping_id);
2894 
2895     return block_gpu_get_processor_to_map(block, block_context, block_get_gpu(block, mapping_id), page_index);
2896 }
2897 
2898 static void block_get_mapped_processors(uvm_va_block_t *block,
2899                                         uvm_va_block_context_t *block_context,
2900                                         uvm_processor_id_t resident_id,
2901                                         uvm_page_index_t page_index,
2902                                         uvm_processor_mask_t *mapped_procs)
2903 {
2904     uvm_processor_id_t mapped_id;
2905 
2906     uvm_processor_mask_zero(mapped_procs);
2907 
2908     for_each_id_in_mask(mapped_id, &block->mapped) {
2909         if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) {
2910             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, block_context, mapped_id, page_index);
2911 
2912             if (uvm_id_equal(to_map_id, resident_id))
2913                 uvm_processor_mask_set(mapped_procs, mapped_id);
2914         }
2915     }
2916 }
2917 
2918 // We use block_gpu_get_processor_to_map to find the destination processor of a
2919 // given GPU mapping. This function is called when the mapping is established to
2920 // sanity check that the destination of the mapping matches the query.
2921 static bool block_check_mapping_residency_region(uvm_va_block_t *block,
2922                                                  uvm_va_block_context_t *block_context,
2923                                                  uvm_gpu_t *gpu,
2924                                                  uvm_processor_id_t mapping_dest,
2925                                                  uvm_va_block_region_t region,
2926                                                  const uvm_page_mask_t *page_mask)
2927 {
2928     uvm_page_index_t page_index;
2929     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2930         NvU64 va = uvm_va_block_cpu_page_address(block, page_index);
2931         uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, block_context, gpu, page_index);
2932         UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map),
2933                        "VA 0x%llx on %s: mapping %s, supposed to map %s",
2934                        va,
2935                        uvm_gpu_name(gpu),
2936                        block_processor_name(block, mapping_dest),
2937                        block_processor_name(block, proc_to_map));
2938     }
2939     return true;
2940 }
2941 
2942 static bool block_check_mapping_residency(uvm_va_block_t *block,
2943                                           uvm_va_block_context_t *block_context,
2944                                           uvm_gpu_t *gpu,
2945                                           uvm_processor_id_t mapping_dest,
2946                                           const uvm_page_mask_t *page_mask)
2947 {
2948     return block_check_mapping_residency_region(block,
2949                                                 block_context,
2950                                                 gpu,
2951                                                 mapping_dest,
2952                                                 uvm_va_block_region_from_block(block),
2953                                                 page_mask);
2954 }
2955 
2956 // Check that there are no mappings targeting resident_id from any processor in
2957 // the block.
2958 static bool block_check_processor_not_mapped(uvm_va_block_t *block,
2959                                              uvm_va_block_context_t *block_context,
2960                                              uvm_processor_id_t resident_id)
2961 {
2962     uvm_processor_id_t mapped_id;
2963     uvm_page_index_t page_index;
2964 
2965     for_each_id_in_mask(mapped_id, &block->mapped) {
2966         const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id);
2967 
2968         for_each_va_block_page_in_mask(page_index, map_mask, block) {
2969             uvm_processor_id_t to_map_id = block_get_processor_to_map(block, block_context, mapped_id, page_index);
2970             UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id));
2971         }
2972     }
2973 
2974     return true;
2975 }
2976 
2977 // Zero all pages of the newly-populated chunk which are not resident anywhere
2978 // else in the system, adding that work to the block's tracker. In all cases,
2979 // this function adds a dependency on passed in tracker to the block's tracker.
2980 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
2981                                           uvm_gpu_t *gpu,
2982                                           uvm_gpu_chunk_t *chunk,
2983                                           uvm_va_block_region_t chunk_region,
2984                                           uvm_tracker_t *tracker)
2985 {
2986     uvm_va_block_gpu_state_t *gpu_state;
2987     NV_STATUS status;
2988     uvm_gpu_address_t memset_addr_base, memset_addr;
2989     uvm_push_t push;
2990     uvm_gpu_id_t id;
2991     uvm_va_block_region_t subregion;
2992     uvm_page_mask_t *zero_mask;
2993 
2994     UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk));
2995 
2996     if (chunk->is_zero)
2997         return NV_OK;
2998 
2999     gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3000     zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
3001 
3002     if (!zero_mask)
3003         return NV_ERR_NO_MEMORY;
3004 
3005     // Tradeoff: zeroing entire chunk vs zeroing only the pages needed for the
3006     // operation.
3007     //
3008     // We may over-zero the page with this approach. For example, we might be
3009     // populating a 2MB chunk because only a single page within that chunk needs
3010     // to be made resident. If we also zero non-resident pages outside of the
3011     // strict region, we could waste the effort if those pages are populated on
3012     // another processor later and migrated here.
3013     //
3014     // We zero all non-resident pages in the chunk anyway for two reasons:
3015     //
3016     // 1) Efficiency. It's better to do all zeros as pipelined transfers once
3017     //    rather than scatter them around for each populate operation.
3018     //
3019     // 2) Optimizing the common case of block_populate_gpu_chunk being called
3020     //    for already-populated chunks. If we zero once at initial populate, we
3021     //    can simply check whether the chunk is present in the array. Otherwise
3022     //    we'd have to recompute the "is any page resident" mask every time.
3023 
3024     // Roll up all pages in chunk_region which are resident somewhere
3025     uvm_page_mask_zero(zero_mask);
3026     for_each_id_in_mask(id, &block->resident)
3027         uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
3028 
3029     // If all pages in the chunk are resident somewhere, we don't need to clear
3030     // anything. Just make sure the chunk is tracked properly.
3031     if (uvm_page_mask_region_full(zero_mask, chunk_region)) {
3032         status = uvm_tracker_add_tracker_safe(&block->tracker, tracker);
3033         goto out;
3034     }
3035 
3036     // Complement to get the pages which are not resident anywhere. These
3037     // are the pages which must be zeroed.
3038     uvm_page_mask_complement(zero_mask, zero_mask);
3039 
3040     memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
3041     memset_addr = memset_addr_base;
3042 
3043     status = uvm_push_begin_acquire(gpu->channel_manager,
3044                                     UVM_CHANNEL_TYPE_GPU_INTERNAL,
3045                                     tracker,
3046                                     &push,
3047                                     "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)",
3048                                     chunk->address,
3049                                     chunk->address + uvm_gpu_chunk_get_size(chunk),
3050                                     uvm_va_block_region_start(block, chunk_region),
3051                                     uvm_va_block_region_end(block, chunk_region) + 1,
3052                                     block->start,
3053                                     block->end + 1);
3054     if (status != NV_OK)
3055         goto out;
3056 
3057     for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) {
3058         // Pipeline the memsets since they never overlap with each other
3059         uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3060 
3061         // We'll push one membar later for all memsets in this loop
3062         uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3063 
3064         memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE;
3065         gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion));
3066     }
3067 
3068     // A membar from this GPU is required between this memset and any PTE write
3069     // pointing this or another GPU to this chunk. Otherwise an engine could
3070     // read the PTE then access the page before the memset write is visible to
3071     // that engine.
3072     //
3073     // This memset writes GPU memory, so local mappings need only a GPU-local
3074     // membar. We can't easily determine here whether a peer GPU will ever map
3075     // this page in the future, so always use a sysmembar. uvm_push_end provides
3076     // one by default.
3077     //
3078     // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map
3079     //       this page. When peer access gets enabled, do a MEMBAR_SYS at that
3080     //       point.
3081     uvm_push_end(&push);
3082     status = uvm_tracker_add_push_safe(&block->tracker, &push);
3083 
3084 out:
3085     if (zero_mask)
3086         kmem_cache_free(g_uvm_page_mask_cache, zero_mask);
3087 
3088     return status;
3089 }
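
// Illustrative example for block_zero_new_gpu_chunk() (hypothetical
// residency, assuming a 4K PAGE_SIZE): when populating a 2M chunk (512 pages)
// where only the pages in the first half of chunk_region are resident
// somewhere else in the system, the complemented zero_mask selects the second
// half, so a single pipelined 1M memset is pushed for that subregion and one
// membar is issued at uvm_push_end().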
3090 
3091 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
3092                                           uvm_va_block_retry_t *retry,
3093                                           uvm_gpu_t *gpu,
3094                                           size_t chunk_index,
3095                                           uvm_va_block_region_t chunk_region)
3096 {
3097     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
3098     uvm_gpu_chunk_t *chunk = NULL;
3099     uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
3100     uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
3101     NV_STATUS status;
3102 
3103     if (!gpu_state)
3104         return NV_ERR_NO_MEMORY;
3105 
3106     uvm_assert_mutex_locked(&block->lock);
3107     UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu));
3108     UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes);
3109 
3110     // We zero chunks as necessary at initial population, so if the chunk is
3111     // already populated we're done. See the comment in
3112     // block_zero_new_gpu_chunk.
3113     if (gpu_state->chunks[chunk_index])
3114         return NV_OK;
3115 
3116     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region));
3117 
3118     status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk);
3119     if (status != NV_OK)
3120         return status;
3121 
3122     // In some configurations such as SR-IOV heavy, the chunk cannot be
3123     // referenced using its physical address. Create a virtual mapping.
3124     status = uvm_mmu_chunk_map(chunk);
3125     if (status != NV_OK)
3126         goto chunk_free;
3127 
3128     status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
3129     if (status != NV_OK)
3130         goto chunk_unmap;
3131 
3132     // It is safe to modify the page index field without holding any PMM locks
3133     // because the chunk is pinned, which means that none of the other fields in
3134     // the bitmap can change.
3135     chunk->va_block_page_index = chunk_region.first;
3136 
3137     // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at
3138     // compile-time that it can store VA Block page indexes.
3139     BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
3140 
3141     status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
3142     if (status != NV_OK)
3143         goto chunk_unmap;
3144 
3145     if (block_test && block_test->inject_populate_error) {
3146         block_test->inject_populate_error = false;
3147 
3148         // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
3149         // causing a fatal OOM failure.
3150         status = NV_ERR_MORE_PROCESSING_REQUIRED;
3151         goto chunk_unmap_indirect_peers;
3152     }
3153 
3154     // Record the used chunk so that it can be unpinned at the end of the whole
3155     // operation.
3156     block_retry_add_used_chunk(retry, chunk);
3157     gpu_state->chunks[chunk_index] = chunk;
3158 
3159     return NV_OK;
3160 
3161 chunk_unmap_indirect_peers:
3162     block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
3163 
3164 chunk_unmap:
3165     uvm_mmu_chunk_unmap(chunk, &block->tracker);
3166 
3167 chunk_free:
3168     // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it
3169     // placed in the block tracker.
3170     uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
3171 
3172     return status;
3173 }
3174 
3175 // Populate all chunks which cover the given region and page mask.
3176 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
3177                                           uvm_va_block_retry_t *retry,
3178                                           uvm_gpu_t *gpu,
3179                                           uvm_va_block_region_t region,
3180                                           const uvm_page_mask_t *populate_mask)
3181 {
3182     uvm_va_block_region_t chunk_region, check_region;
3183     size_t chunk_index;
3184     uvm_page_index_t page_index;
3185     uvm_chunk_size_t chunk_size;
3186     NV_STATUS status;
3187 
3188     page_index = uvm_va_block_first_page_in_mask(region, populate_mask);
3189     if (page_index == region.outer)
3190         return NV_OK;
3191 
3192     chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
3193     chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
3194 
3195     while (1) {
3196         check_region = uvm_va_block_region(max(chunk_region.first, region.first),
3197                                            min(chunk_region.outer, region.outer));
3198         page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask);
3199         if (page_index != check_region.outer) {
3200             status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region);
3201             if (status != NV_OK)
3202                 return status;
3203         }
3204 
3205         if (check_region.outer == region.outer)
3206             break;
3207 
3208         ++chunk_index;
3209         chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer);
3210         chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE));
3211     }
3212 
3213     return NV_OK;
3214 }
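
// Illustrative walk-through of block_populate_pages_gpu() (hypothetical
// layout, assuming a 4K PAGE_SIZE and uniform 64K GPU chunks of 16 pages
// each): for region [10, 40) with populate_mask bits {12, 35}:
//
//   - chunk 0, pages [0, 16):  check_region [10, 16) contains bit 12, so the
//     chunk is populated
//   - chunk 1, pages [16, 32): check_region [16, 32) has no bits set, so the
//     chunk is skipped
//   - chunk 2, pages [32, 48): check_region [32, 40) contains bit 35, so the
//     chunk is populated and the loop stops because check_region.outer equals
//     region.outer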
3215 
3216 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
3217 {
3218     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3219 
3220     return &va_space->can_copy_from[uvm_id_value(from)];
3221 }
3222 
3223 static NV_STATUS block_populate_pages(uvm_va_block_t *block,
3224                                       uvm_va_block_retry_t *retry,
3225                                       uvm_va_block_context_t *block_context,
3226                                       uvm_processor_id_t dest_id,
3227                                       uvm_va_block_region_t region,
3228                                       const uvm_page_mask_t *page_mask)
3229 {
3230     NV_STATUS status;
3231     const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block,
3232                                                                          dest_id,
3233                                                                          block_context->make_resident.dest_nid);
3234     uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
3235     uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
3236     uvm_page_mask_t *cpu_populate_mask;
3237     uvm_memcg_context_t memcg_context;
3238 
3239     if (!resident_mask)
3240         return NV_ERR_NO_MEMORY;
3241 
3242     if (page_mask)
3243         uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask);
3244     else
3245         uvm_page_mask_complement(populate_page_mask, resident_mask);
3246 
3247     if (UVM_ID_IS_GPU(dest_id)) {
3248         const uvm_processor_mask_t *can_copy_from_processors;
3249         uvm_processor_mask_t *tmp_processor_mask;
3250         uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
3251         uvm_page_mask_t *id_resident_mask;
3252         uvm_processor_id_t id;
3253 
3254         tmp_processor_mask = uvm_processor_mask_cache_alloc();
3255         if (!tmp_processor_mask)
3256             return NV_ERR_NO_MEMORY;
3257 
3258         status = block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
3259         if (status != NV_OK) {
3260             uvm_processor_mask_cache_free(tmp_processor_mask);
3261             return status;
3262         }
3263 
3264         uvm_page_mask_zero(pages_staged);
3265 
3266         // Get the mask of all processors that have resident pages from which
3267         // the destination cannot copy directly.
3268         can_copy_from_processors = block_get_can_copy_from_mask(block, dest_id);
3269         if (!uvm_processor_mask_andnot(tmp_processor_mask, &block->resident, can_copy_from_processors)) {
3270             uvm_processor_mask_cache_free(tmp_processor_mask);
3271             return status;
3272         }
3273 
3274         // Compute the pages that will be staged through the CPU by:
3275         //   1. Computing all of the pages resident on the processors from which
3276         //      dest_id cannot directly copy.
3277         for_each_id_in_mask(id, tmp_processor_mask) {
3278             id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
3279             uvm_page_mask_and(scratch_page_mask, populate_page_mask, id_resident_mask);
3280             uvm_page_mask_or(pages_staged, pages_staged, scratch_page_mask);
3281         }
3282 
3283         //   2. Remove any pages in pages_staged that are on any resident processor
3284         //      dest_id can copy from.
3285         if (uvm_processor_mask_and(tmp_processor_mask, can_copy_from_processors, &block->resident)) {
3286             for_each_id_in_mask(id, tmp_processor_mask) {
3287                 id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
3288                 uvm_page_mask_andnot(pages_staged, pages_staged, id_resident_mask);
3289             }
3290         }
3291 
3292         //   3. Removing any pages not in the populate mask.
3293         uvm_page_mask_region_clear_outside(pages_staged, region);
3294         cpu_populate_mask = pages_staged;
3295 
3296         uvm_processor_mask_cache_free(tmp_processor_mask);
3297     }
3298     else {
3299         cpu_populate_mask = populate_page_mask;
3300     }
3301 
3302     uvm_memcg_context_start(&memcg_context, block_context->mm);
3303     status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context, UVM_ID_IS_GPU(dest_id));
3304     uvm_memcg_context_end(&memcg_context);
3305     return status;
3306 }
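
// Illustrative example for the staging logic in block_populate_pages()
// (hypothetical topology): if dest_id is a GPU that can copy directly from
// the CPU and from a peer GPU A, but not from another GPU B, then pages in
// the populate mask that are resident only on B are added to pages_staged so
// they can be staged through the CPU, while pages that are also resident on
// the CPU or on A are removed from pages_staged again in step 2.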
3307 
3308 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
3309 {
3310     return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
3311 }
3312 
3313 // Get the chunk containing the given page, along with the offset of that page
3314 // within the chunk.
3315 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
3316 {
3317     uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
3318     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
3319     size_t chunk_index;
3320     uvm_gpu_chunk_t *chunk;
3321     uvm_chunk_size_t chunk_size;
3322 
3323     UVM_ASSERT(gpu_state);
3324 
3325     chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
3326     chunk = gpu_state->chunks[chunk_index];
3327     UVM_ASSERT(chunk);
3328 
3329     if (chunk_offset) {
3330         size_t page_offset = block_page.page_index -
3331                              uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
3332         *chunk_offset = page_offset * PAGE_SIZE;
3333     }
3334 
3335     return chunk;
3336 }
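
// For example (hypothetical values, assuming a 4K PAGE_SIZE): if the chunk
// backing block_page is 64K and its chunk region starts at page index 32,
// then page_index 35 gives a page_offset of 3 and a *chunk_offset of
// 3 * 4K = 12K.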
3337 
3338 // Get the physical GPU address of a block's page from the POV of the specified GPU
3339 // This is the address that should be used for making PTEs for the specified GPU.
3340 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
3341                                                       block_phys_page_t block_page,
3342                                                       uvm_gpu_t *gpu)
3343 {
3344     uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3345     size_t chunk_offset;
3346     uvm_gpu_chunk_t *chunk;
3347 
3348     UVM_ASSERT(accessing_gpu_state);
3349 
3350     if (UVM_ID_IS_CPU(block_page.processor)) {
3351         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
3352         NvU64 dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
3353         uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
3354                                                                        uvm_cpu_chunk_get_size(chunk),
3355                                                                        block_page.page_index);
3356 
3357         // The page should be mapped for physical access already as we do that
3358         // eagerly on CPU page population and GPU state alloc.
3359         UVM_ASSERT(dma_addr != 0);
3360         dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
3361 
3362         return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
3363     }
3364 
3365     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
3366 
3367     if (uvm_id_equal(block_page.processor, gpu->id)) {
3368         return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
3369     }
3370     else {
3371         uvm_gpu_phys_address_t phys_addr;
3372         uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
3373         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3374 
3375         UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
3376         phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
3377         phys_addr.address += chunk_offset;
3378         return phys_addr;
3379     }
3380 }
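
// To summarize block_phys_page_address() with an example: a CPU-resident page
// is returned as a UVM_APERTURE_SYS address built from the chunk's DMA
// mapping for the accessing GPU, a page resident on the accessing GPU itself
// is returned as a UVM_APERTURE_VID address into its chunk, and a page
// resident on a peer GPU is returned via the peer physical address reported
// by the owning GPU's PMM, offset into the chunk.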
3381 
3382 // Get the physical GPU address of a block's page from the POV of the specified
3383 // GPU, suitable for accessing the memory from UVM-internal CE channels.
3384 //
3385 // Notably this may be different from block_phys_page_address() to handle CE
3386 // limitations in addressing physical memory directly.
3387 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block,
3388                                                       block_phys_page_t block_page,
3389                                                       uvm_gpu_t *gpu)
3390 {
3391     uvm_gpu_t *owning_gpu;
3392     size_t chunk_offset;
3393     uvm_gpu_chunk_t *chunk;
3394     uvm_gpu_address_t copy_addr;
3395     uvm_va_space_t *va_space;
3396 
3397     UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor),
3398                    "from %s to %s\n",
3399                    block_processor_name(block, gpu->id),
3400                    block_processor_name(block, block_page.processor));
3401 
3402     // CPU and local GPU accesses can rely on block_phys_page_address, but the
3403     // resulting physical address may need to be converted into virtual.
3404     if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id))
3405         return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu));
3406 
3407     va_space = uvm_va_block_get_va_space(block);
3408 
3409     // See the comments on the peer_identity_mappings_supported assignments in
3410     // the HAL for why we disable direct copies between peers.
3411     owning_gpu = block_get_gpu(block, block_page.processor);
3412 
3413     UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
3414 
3415     chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
3416     copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu);
3417     copy_addr.address += chunk_offset;
3418     return copy_addr;
3419 }
3420 
3421 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
3422                                                           uvm_page_index_t page_index,
3423                                                           uvm_processor_id_t residency,
3424                                                           uvm_gpu_t *gpu)
3425 {
3426     int nid = NUMA_NO_NODE;
3427 
3428     uvm_assert_mutex_locked(&va_block->lock);
3429     if (UVM_ID_IS_CPU(residency)) {
3430         nid = block_get_page_node_residency(va_block, page_index);
3431         UVM_ASSERT(nid != NUMA_NO_NODE);
3432     }
3433 
3434     return block_phys_page_address(va_block, block_phys_page(residency, nid, page_index), gpu);
3435 }
3436 
uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
3438                                                           uvm_page_index_t page_index,
3439                                                           uvm_gpu_t *gpu)
3440 {
3441     return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu);
3442 }
3443 
3444 typedef struct
3445 {
3446     // Location of the memory
3447     uvm_processor_id_t id;
3448 
3449     // NUMA node ID if the processor is the CPU. Ignored otherwise.
3450     int nid;
3451 
3452     // Whether the whole block has a single physically-contiguous chunk of
3453     // storage on the processor.
3454     bool is_block_contig;
3455 
3456     // Starting address of the physically-contiguous allocation, from the view
3457     // of the copying GPU. Valid only if is_block_contig.
3458     uvm_gpu_address_t gpu_address;
3459 } block_copy_addr_t;
3460 
3461 typedef struct
3462 {
3463     block_copy_addr_t src;
3464     block_copy_addr_t dst;
3465     uvm_conf_computing_dma_buffer_t *dma_buffer;

    // True if at least one CE transfer (such as a memcopy) has been pushed to
    // the GPU during the VA block copy so far.
3468     bool copy_pushed;
3469 } block_copy_state_t;
3470 
// Begin a push appropriate for copying data from the src_id processor to the
// dst_id processor. One of src_id and dst_id needs to be a GPU unless
// CPU-to-CPU copies with CEs are enabled, in which case both may be the CPU.
static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block,
3474                                        block_copy_state_t *copy_state,
3475                                        uvm_tracker_t *tracker,
3476                                        uvm_push_t *push)
3477 {
3478     uvm_gpu_t *gpu;
3479     NV_STATUS status;
3480     uvm_channel_type_t channel_type;
3481     uvm_tracker_t *tracker_ptr = tracker;
3482     uvm_processor_id_t dst_id = copy_state->dst.id;
3483     uvm_processor_id_t src_id = copy_state->src.id;
3484     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3485     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
3486 
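    // A copy where the source and destination processors are the same is only
    // expected for CPU-to-CPU copies staged through a CE; in every other case
    // the two processors must differ.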
3487     if (!(uvm_block_cpu_to_cpu_copy_with_ce || va_space->test.force_cpu_to_cpu_copy_with_ce) ||
3488         UVM_ID_IS_GPU(src_id) ||
3489         UVM_ID_IS_GPU(dst_id)) {
3490         UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
3491                        "Unexpected copy to self, processor %s\n",
3492                        block_processor_name(va_block, src_id));
3493     }
3494 
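    // For CPU-to-CPU copies staged through a CE, prefer a GPU attached to the
    // source NUMA node so the engine reads from nearby memory, and fall back
    // to the first registered GPU otherwise.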
    if (UVM_ID_IS_CPU(src_id) && UVM_ID_IS_CPU(dst_id)) {
3498         gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, copy_state->src.nid);
3499         if (!gpu)
3500             gpu = uvm_va_space_find_first_gpu(va_space);
3501 
3502         channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
3503     }
3504     else if (UVM_ID_IS_CPU(src_id)) {
3505         gpu = block_get_gpu(va_block, dst_id);
3506         channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
3507     }
3508     else if (UVM_ID_IS_CPU(dst_id)) {
3509         gpu = block_get_gpu(va_block, src_id);
3510         channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
3511     }
3512     else {
3513         // For GPU to GPU copies, prefer to "push" the data from the source as
3514         // that works better at least for P2P over PCI-E.
3515         gpu = block_get_gpu(va_block, src_id);
3516 
3517         channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
3518     }
3519 
3520     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id),
3521                    "GPU %s dst %s src %s\n",
3522                    block_processor_name(va_block, gpu->id),
3523                    block_processor_name(va_block, dst_id),
3524                    block_processor_name(va_block, src_id));
3525     UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id),
3526                    "GPU %s dst %s src %s\n",
3527                    block_processor_name(va_block, gpu->id),
3528                    block_processor_name(va_block, dst_id),
3529                    block_processor_name(va_block, src_id));
3530 
3531     if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
3532         uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id);
3533         return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager,
3534                                                  dst_gpu,
3535                                                  tracker,
3536                                                  push,
3537                                                  "Copy from %s to %s for block [0x%llx, 0x%llx]",
3538                                                  block_processor_name(va_block, src_id),
3539                                                  block_processor_name(va_block, dst_id),
3540                                                  va_block->start,
3541                                                  va_block->end);
3542     }
3543 
3544     if (g_uvm_global.conf_computing_enabled) {
        // When Confidential Computing is enabled, additional dependencies
        // apply to the input tracker as well as the dma_buffer tracker.
        // * In the CPU to GPU case, because UVM performs the CPU-side
        //   crypto-operations before the GPU copy, both the dma_buffer
        //   tracker and the input tracker must be complete first.
        // * In the GPU to CPU case, the GPU copy happens first, but the same
        //   principles apply. Hence, UVM acquires the input tracker and the
        //   dma buffer.
3553         status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
3554         if (status != NV_OK)
3555             goto error;
3556 
3557         UVM_ASSERT(copy_state->dma_buffer == NULL);
3558         status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
3559                                                      &copy_state->dma_buffer,
3560                                                      &local_tracker);
3561 
3562         if (status != NV_OK)
3563             goto error;
3564 
3565         if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
3566             status = uvm_tracker_wait(&local_tracker);
3567             if (status != NV_OK)
3568                 goto error;
3569         }
3570 
3571         tracker_ptr = &local_tracker;
3572     }
3573 
3574     status = uvm_push_begin_acquire(gpu->channel_manager,
3575                                     channel_type,
3576                                     tracker_ptr,
3577                                     push,
3578                                     "Copy from %s to %s for block [0x%llx, 0x%llx]",
3579                                     block_processor_name(va_block, src_id),
3580                                     block_processor_name(va_block, dst_id),
3581                                     va_block->start,
3582                                     va_block->end);
3583 
3584 error:
3585     // Caller is responsible for freeing the DMA buffer on error
3586     uvm_tracker_deinit(&local_tracker);
3587     return status;
3588 }
3589 
// A page is clean iff...
// the block is not HMM and
// the destination is equal to the preferred location and
// the source is the CPU and
// the destination is not the CPU and
// the destination does not support faults/eviction and
// the CPU page is not dirty
static bool block_page_is_clean(uvm_va_block_t *block,
3597                                 uvm_processor_id_t dst_id,
3598                                 int dst_nid,
3599                                 uvm_processor_id_t src_id,
3600                                 int src_nid,
3601                                 uvm_page_index_t page_index)
3602 {
3603     return !uvm_va_block_is_hmm(block) &&
3604            uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), dst_id, dst_nid) &&
3605            UVM_ID_IS_CPU(src_id) &&
3606            !UVM_ID_IS_CPU(dst_id) &&
3607            !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
3608            !block_cpu_page_is_dirty(block, page_index, src_nid);
3609 }
3610 
// When the destination is the CPU...
// if the source matches the preferred location (processor and NUMA node),
// mark the page as clean; otherwise, mark it as dirty
static void block_update_page_dirty_state(uvm_va_block_t *block,
3615                                           uvm_processor_id_t dst_id,
3616                                           int dst_nid,
3617                                           uvm_processor_id_t src_id,
3618                                           int src_nid,
3619                                           uvm_page_index_t page_index)
3620 {
3621     uvm_va_policy_t *policy;
3622 
3623     if (UVM_ID_IS_GPU(dst_id))
3624         return;
3625 
3626     policy = uvm_va_range_get_policy(block->va_range);
3627     if (uvm_va_policy_preferred_location_equal(policy, src_id, src_nid))
3628         block_mark_cpu_page_clean(block, page_index, dst_nid);
3629     else
3630         block_mark_cpu_page_dirty(block, page_index, dst_nid);
3631 }
3632 
static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
3634 {
3635     uvm_gpu_t *gpu;
3636 
3637     if (UVM_ID_IS_CPU(id))
3638         return;
3639 
3640     gpu = block_get_gpu(block, id);
3641 
3642     // If the block is of the max size and the GPU supports eviction, mark the
3643     // root chunk as used in PMM.
3644     // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks.
3645     if (!uvm_va_block_is_hmm(block) &&
3646         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
3647         uvm_parent_gpu_supports_eviction(gpu->parent)) {
3648         // The chunk has to be there if this GPU is resident
3649         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id));
3650         uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]);
3651     }
3652 }
3653 
static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
3655 {
3656     UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
3657 
3658     if (uvm_processor_mask_test_and_set(&block->resident, id))
3659         return;
3660 
3661     block_mark_memory_used(block, id);
3662 }
3663 
static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
3665 {
3666     uvm_gpu_t *gpu;
3667 
3668     UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
3669 
3670     if (!uvm_processor_mask_test_and_clear(&block->resident, id))
3671         return;
3672 
3673     if (UVM_ID_IS_CPU(id))
3674         return;
3675 
3676     gpu = block_get_gpu(block, id);
3677 
3678     // If the block is of the max size and the GPU supports eviction, mark the
3679     // root chunk as unused in PMM.
3680     if (!uvm_va_block_is_hmm(block) &&
3681         uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
3682         uvm_parent_gpu_supports_eviction(gpu->parent)) {
3683         // The chunk may not be there any more when residency is cleared.
3684         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3685         if (gpu_state && gpu_state->chunks[0])
3686             uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]);
3687     }
3688 }
3689 
static bool block_phys_copy_contig_check(uvm_va_block_t *block,
3691                                          uvm_page_index_t page_index,
3692                                          const uvm_gpu_address_t *base_address,
3693                                          uvm_processor_id_t proc_id,
3694                                          int nid,
3695                                          uvm_gpu_t *copying_gpu)
3696 {
3697     uvm_gpu_address_t page_address;
3698     uvm_gpu_address_t contig_address = *base_address;
3699 
3700     contig_address.address += page_index * PAGE_SIZE;
3701     page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, nid, page_index), copying_gpu);
3702 
3703     return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
3704 }
3705 
3706 // Check if the VA block has a single physically-contiguous chunk of storage
3707 // on the processor.
static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id, int nid)
3709 {
3710     uvm_cpu_chunk_t *chunk;
3711 
3712     if (UVM_ID_IS_GPU(id))
3713         return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
3714 
3715     UVM_ASSERT(nid != NUMA_NO_NODE);
3716     chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), nid, NULL);
3717     return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
3718 }
3719 
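// Return the block region covered by the physically-contiguous chunk backing
// page_index on the given processor (and NUMA node, for the CPU).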
static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
3721                                                       uvm_page_index_t page_index,
3722                                                       uvm_processor_id_t resident_id,
3723                                                       int nid)
3724 {
3725     if (UVM_ID_IS_CPU(resident_id)) {
3726         uvm_cpu_chunk_t *chunk;
3727         UVM_ASSERT(nid != NUMA_NO_NODE);
3728         chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
3729         return uvm_cpu_chunk_block_region(block, chunk, page_index);
3730     }
3731     else {
3732         uvm_chunk_size_t chunk_size;
3733         (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size);
3734         return uvm_va_block_chunk_region(block, chunk_size, page_index);
3735     }
3736 }
3737 
3738 // Like block_phys_page_copy_address, but uses the address cached in bca when
3739 // possible.
static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
3741                                                 block_copy_addr_t *bca,
3742                                                 uvm_page_index_t page_index,
3743                                                 uvm_gpu_t *copying_gpu)
3744 {
3745     if (bca->is_block_contig) {
3746         uvm_gpu_address_t addr = bca->gpu_address;
3747         addr.address += page_index * PAGE_SIZE;
3748         UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, bca->nid, copying_gpu));
3749         return addr;
3750     }
3751 
3752     return block_phys_page_copy_address(block, block_phys_page(bca->id, bca->nid, page_index), copying_gpu);
3753 }
3754 
// When the Confidential Computing feature is enabled, the function performs
// CPU-side page encryption and GPU-side decryption into the CPR (Compute
// Protected Region). GPU operations respect the caller's membar previously
// set in the push.
static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
3759                                                       block_copy_state_t *copy_state,
3760                                                       uvm_va_block_region_t region,
3761                                                       uvm_push_t *push)
3762 {
3763     uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
3764     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3765     uvm_page_index_t page_index = region.first;
3766     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3767     struct page *src_page;
3768     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3769     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3770     char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
3771                                         (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3772     uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
3773     char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
3774     uvm_cpu_chunk_t *chunk;
3775     uvm_va_block_region_t chunk_region;
3776 
3777     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
3778     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
3779     UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3780 
3781     // See comment in block_copy_begin_push.
3782     UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
3783 
3784     chunk = uvm_cpu_chunk_get_chunk_for_page(block, copy_state->src.nid, page_index);
3785     UVM_ASSERT(chunk);
3786 
3787     // The caller guarantees that all pages in region are contiguous,
3788     // meaning they're guaranteed to be part of the same compound page.
3789     chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
3790     UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, region));
3791 
3792     src_page = uvm_cpu_chunk_get_cpu_page(block, chunk, page_index);
3793     staging_buffer.address += page_index * PAGE_SIZE;
3794     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3795 
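    // Capture and clear any caller-requested membar. In the loop below, every
    // decrypt except the last uses MEMBAR_NONE; the caller's membar, if any,
    // is applied only to the final decrypt.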
3796     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3797         push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3798     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3799         push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3800 
3801     // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3802     // decryption must happen on a PAGE_SIZE basis.
3803     for_each_va_block_page_in_region(page_index, region) {
3804         void *src_cpu_virt_addr;
3805 
3806         src_cpu_virt_addr = kmap(src_page);
3807         uvm_conf_computing_cpu_encrypt(push->channel,
3808                                        cpu_va_staging_buffer,
3809                                        src_cpu_virt_addr,
3810                                        NULL,
3811                                        PAGE_SIZE,
3812                                        cpu_auth_tag_buffer);
3813         kunmap(src_page);
3814 
3815         // All but the first decryption can be pipelined. The first decryption
3816         // uses the caller's pipelining settings.
3817         if (page_index > region.first)
3818             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3819 
3820         if (page_index < (region.outer - 1))
3821             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3822         else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
3823             uvm_push_set_flag(push, push_membar_flag);
3824 
3825         gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
3826 
3827         src_page++;
3828         dst_address.address += PAGE_SIZE;
3829         cpu_va_staging_buffer += PAGE_SIZE;
3830         staging_buffer.address += PAGE_SIZE;
3831         cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3832         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3833     }
3834 }
3835 
3836 // When the Confidential Computing feature is enabled, the function performs
3837 // GPU side page encryption. GPU operations respect the caller's membar
3838 // previously set in the push.
static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
3840                                                       block_copy_state_t *copy_state,
3841                                                       uvm_va_block_region_t region,
3842                                                       uvm_push_t *push)
3843 {
3844     uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
3845     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3846     uvm_page_index_t page_index = region.first;
3847     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3848     uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3849     uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3850     uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
3851     NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);
3852 
3853     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3854     UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
3855     UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3856 
3857     staging_buffer.address += page_index * PAGE_SIZE;
3858     auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3859 
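    // As in the CPU to GPU path, capture and clear any caller-requested membar
    // so that only the last encryption in the loop applies it.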
3860     if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3861         push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3862     else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3863         push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3864 
3865     // Because we use kmap() for mapping pages for CPU side
3866     // crypto-operations and it only guarantees PAGE_SIZE contiguity, all
3867     // encryptions and decryptions must happen on a PAGE_SIZE basis.
3868     for_each_va_block_page_in_region(page_index, region) {
3869         uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
3870         dma_buffer->key_version[page_index] = key_version;
3871 
3872         // All but the first encryption can be pipelined. The first encryption
3873         // uses the caller's pipelining settings.
3874         if (page_index > region.first)
3875             uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3876 
3877         if (page_index < (region.outer - 1))
3878             uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3879         else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
3880             uvm_push_set_flag(push, push_membar_flag);
3881 
3882         gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3883 
3884         src_address.address += PAGE_SIZE;
3885         staging_buffer.address += PAGE_SIZE;
3886         auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3887     }
3888 
3889     uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3890 }
3891 
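// For GPU to CPU copies under Confidential Computing, wait for the pushed
// encryptions to complete, then decrypt the staged pages into their
// destination CPU pages. No-op for copies to the GPU.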
static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3893                                                   block_copy_state_t *copy_state,
3894                                                   uvm_push_t *push)
3895 {
3896     NV_STATUS status;
3897     uvm_page_index_t page_index;
3898     uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3899     uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3900     void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3901     void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3902 
3903     UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3904 
3905     if (UVM_ID_IS_GPU(copy_state->dst.id))
3906         return NV_OK;
3907 
3908     UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3909 
3910     status = uvm_push_wait(push);
3911     if (status != NV_OK)
3912         return status;
3913 
3914     // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3915     // decryption must happen on a PAGE_SIZE basis.
3916     for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3917         // All CPU chunks for the copy have already been allocated in
3918         // block_populate_pages() and copy_state has been filled in based on
3919         // those allocations.
3920         uvm_cpu_chunk_t *cpu_chunk = uvm_cpu_chunk_get_chunk_for_page(block, copy_state->dst.nid, page_index);
3921         struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, cpu_chunk, page_index);
3922         void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3923         void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3924         void *cpu_page_address = kmap(dst_page);
3925 
3926         status = uvm_conf_computing_cpu_decrypt(push->channel,
3927                                                 cpu_page_address,
3928                                                 staging_buffer,
3929                                                 dma_buffer->decrypt_iv + page_index,
3930                                                 dma_buffer->key_version[page_index],
3931                                                 PAGE_SIZE,
3932                                                 auth_tag_buffer);
3933         kunmap(dst_page);
3934         if (status != NV_OK) {
            // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
            //                    failures & other failures gracefully.
            // uvm_conf_computing_cpu_decrypt() can fail if the authentication
            // tag verification fails. Should this happen, it is treated as a
            // critical failure from which there is no recovery.
3940             uvm_global_set_fatal_error(status);
3941             return status;
3942         }
3943     }
3944 
3945     return NV_OK;
3946 }
3947 
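// Push the copy of the given region. Under Confidential Computing the data is
// staged through encrypted buffers; otherwise a single CE memcopy is pushed.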
static void block_copy_push(uvm_va_block_t *block,
3949                             block_copy_state_t *copy_state,
3950                             uvm_va_block_region_t region,
3951                             uvm_push_t *push)
3952 {
3953     uvm_gpu_address_t gpu_dst_address, gpu_src_address;
3954     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3955 
3956     // Only the first transfer is not pipelined. Since the callees observe the
3957     // caller's pipeline settings, pipelining must be disabled in that first
3958     // transfer.
3959     if (copy_state->copy_pushed)
3960         uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3961     else
3962         UVM_ASSERT(!uvm_push_test_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED));
3963 
3964     uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3965 
3966     if (g_uvm_global.conf_computing_enabled) {
3967         if (UVM_ID_IS_CPU(copy_state->src.id))
3968             conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3969         else
3970             conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3971     }
3972     else {
3973         gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3974         gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3975 
3976         gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3977     }
3978 
3979     copy_state->copy_pushed = true;
3980 }
3981 
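// End the push started by block_copy_begin_push: complete any CPU-side
// decryption required under Confidential Computing and add the push to the
// copy tracker.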
static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3983                                      block_copy_state_t *copy_state,
3984                                      uvm_tracker_t *copy_tracker,
3985                                      NV_STATUS push_status,
3986                                      uvm_push_t *push)
3987 {
3988     NV_STATUS tracker_status;
3989 
3990     // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3991     //       by that GPU, use a GPU-local membar if no peer can currently
3992     //       map this page. When peer access gets enabled, do a MEMBAR_SYS
3993     //       at that point.
3994     uvm_push_end(push);
3995 
3996     if ((push_status == NV_OK) && g_uvm_global.conf_computing_enabled)
3997         push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3998 
3999     tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
4000     if (push_status == NV_OK)
4001         push_status = tracker_status;
4002 
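    // With Confidential Computing, return the staging DMA buffer to its pool.
    // The buffer is freed with a tracker entry for this push so it is not
    // reused before the copy completes.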
4003     if (g_uvm_global.conf_computing_enabled) {
4004         uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
4005 
4006         uvm_tracker_overwrite_with_push(&local_tracker, push);
4007         uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
4008                                            copy_state->dma_buffer,
4009                                            &local_tracker);
4010         copy_state->dma_buffer = NULL;
4011         uvm_tracker_deinit(&local_tracker);
4012     }
4013 
4014     return push_status;
4015 }
4016 
// Copies use CEs if either:
//   - uvm_block_cpu_to_cpu_copy_with_ce or
//     uvm_test_force_block_cpu_to_cpu_copy_with_ce is set AND there are
//     registered GPUs in the VA space, or
//   - the copy is not a CPU-to-CPU copy (the source and destination are not
//     both the CPU).
static bool block_copy_should_use_push(uvm_va_block_t *block, block_copy_state_t *copy_state)
4023 {
4024     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4025 
4026     return ((uvm_block_cpu_to_cpu_copy_with_ce || va_space->test.force_cpu_to_cpu_copy_with_ce) &&
4027             uvm_processor_mask_get_gpu_count(&va_space->registered_gpus)) ||
4028         !(UVM_ID_IS_CPU(copy_state->src.id) && uvm_id_equal(copy_state->src.id, copy_state->dst.id));
4029 }
4030 
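// Copy the pages in region, either with a plain memcpy() for CPU-to-CPU
// copies that do not use CEs, or by pushing CE work through block_copy_push().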
static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
4032                                   block_copy_state_t *copy_state,
4033                                   uvm_va_block_region_t region,
4034                                   uvm_push_t *push)
4035 {
4036     if (!block_copy_should_use_push(va_block, copy_state)) {
4037         uvm_cpu_chunk_t *src_chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, copy_state->src.nid, region.first);
4038         uvm_cpu_chunk_t *dst_chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, copy_state->dst.nid, region.first);
4039         uvm_va_block_region_t src_chunk_region = uvm_cpu_chunk_block_region(va_block, src_chunk, region.first);
4040         uvm_va_block_region_t dst_chunk_region = uvm_cpu_chunk_block_region(va_block, dst_chunk, region.first);
4041         struct page *src_chunk_page = uvm_cpu_chunk_get_cpu_page(va_block, src_chunk, src_chunk_region.first);
4042         struct page *dst_chunk_page = uvm_cpu_chunk_get_cpu_page(va_block, dst_chunk, dst_chunk_region.first);
4043         uvm_page_index_t page_index;
4044         NV_STATUS status;
4045 
4046         UVM_ASSERT(dst_chunk);
4047         UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
4048         UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
4049 
4050         // CPU-to-CPU copies using memcpy() don't have any inherent ordering with
4051         // copies using GPU CEs. So, we have to make sure that all previously
4052         // submitted work is complete.
4053         status = uvm_tracker_wait(&va_block->tracker);
4054         if (status != NV_OK)
4055             return status;
4056 
4057         for_each_va_block_page_in_region(page_index, region) {
4058             struct page *src_page = src_chunk_page + (page_index - src_chunk_region.first);
4059             struct page *dst_page = dst_chunk_page + (page_index - dst_chunk_region.first);
            void *src_addr = kmap(src_page);
            void *dst_addr = kmap(dst_page);

            memcpy(dst_addr, src_addr, PAGE_SIZE);

            // kunmap() takes the mapped page, not the kernel virtual address
            // returned by kmap(), matching the other kmap()/kunmap() pairs in
            // this file.
            kunmap(src_page);
            kunmap(dst_page);
4066 
4067             if (block_cpu_page_is_dirty(va_block, page_index, copy_state->src.nid))
4068                 block_mark_cpu_page_dirty(va_block, page_index, copy_state->dst.nid);
4069         }
4070     }
4071     else {
4072         block_copy_push(va_block, copy_state, region, push);
4073     }
4074 
4075     return NV_OK;
4076 }
4077 
4078 // Copies pages resident on the src_id processor to the dst_id processor
4079 //
4080 // The function adds the pages that were successfully copied to the output
4081 // migrated_pages mask and returns the number of pages in copied_pages. These
4082 // fields are reliable even if an error is returned.
4083 //
4084 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
4086                                                    uvm_va_block_context_t *block_context,
4087                                                    uvm_processor_id_t dst_id,
4088                                                    int dst_nid,
4089                                                    uvm_processor_id_t src_id,
4090                                                    int src_nid,
4091                                                    uvm_va_block_region_t region,
4092                                                    uvm_page_mask_t *copy_mask,
4093                                                    const uvm_page_mask_t *prefetch_page_mask,
4094                                                    uvm_va_block_transfer_mode_t transfer_mode,
4095                                                    uvm_page_mask_t *migrated_pages,
4096                                                    NvU32 *copied_pages,
4097                                                    uvm_tracker_t *copy_tracker)
4098 {
4099     NV_STATUS status = NV_OK;
4100     uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id, dst_nid);
4101     uvm_gpu_t *copying_gpu = NULL;
4102     uvm_push_t push;
4103     uvm_page_index_t page_index;
4104     uvm_page_index_t contig_start_index = region.outer;
4105     uvm_page_index_t last_index = region.outer;
4106     uvm_range_group_range_t *rgr = NULL;
4107     bool rgr_has_changed = false;
4108     uvm_make_resident_cause_t cause = block_context->make_resident.cause;
4109     uvm_make_resident_cause_t contig_cause = cause;
4110     const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
4111                                cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
4112                                cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask;
4113     block_copy_state_t copy_state = {0};
4114     uvm_va_range_t *va_range = block->va_range;
4115     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4116     uvm_va_block_region_t contig_region = {0};
4117     NvU64 cpu_migration_begin_timestamp = 0;
4118 
4119     *copied_pages = 0;
4120 
4121     if (UVM_ID_IS_CPU(src_id))
4122         UVM_ASSERT(src_nid != NUMA_NO_NODE);
4123 
4124     if (UVM_ID_IS_CPU(dst_id))
4125         UVM_ASSERT(dst_nid != NUMA_NO_NODE);
4126 
4127     // If there are no pages to be copied, exit early
4128     if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask))
4129         return NV_OK;
4130 
4131     if (migrated_pages && !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
4132         return NV_OK;
4133 
4134     copy_state.src.id = src_id;
4135     copy_state.dst.id = dst_id;
4136     copy_state.src.nid = src_nid;
4137     copy_state.dst.nid = dst_nid;
4138 
4139     copy_state.src.is_block_contig = is_block_phys_contig(block, src_id, copy_state.src.nid);
4140     copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id, copy_state.dst.nid);
4141 
4142     // uvm_range_group_range_iter_first should only be called when the va_space
4143     // lock is held, which is always the case unless an eviction is taking
4144     // place.
4145     if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
4146         rgr = uvm_range_group_range_iter_first(va_space,
4147                                                uvm_va_block_region_start(block, region),
4148                                                uvm_va_block_region_end(block, region));
4149         rgr_has_changed = true;
4150     }
4151 
4152     // TODO: Bug 3745051: This function is complicated and needs refactoring
4153     for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
4154         NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
4155         uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ?
4156                                                 UVM_MAKE_RESIDENT_CAUSE_PREFETCH:
4157                                                 cause;
4158 
4159         UVM_ASSERT(block_check_resident_proximity(block, block_context, page_index, dst_id));
4160         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
4161 
4162         // If we're not evicting and we're migrating away from the preferred
4163         // location, then we should add the range group range to the list of
4164         // migrated ranges in the range group. It's safe to skip this because
4165         // the use of range_group's migrated_ranges list is a UVM-Lite
4166         // optimization - eviction is not supported on UVM-Lite GPUs.
4167         if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
4168             uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), src_id, src_nid)) {
4169             // rgr_has_changed is used to minimize the number of times the
4170             // migrated_ranges_lock is taken. It is set to false when the range
4171             // group range pointed by rgr is added to the migrated_ranges list,
4172             // and it is just set back to true when we move to a different
4173             // range group range.
4174 
4175             // The current page could be after the end of rgr. Iterate over the
4176             // range group ranges until rgr's end location is greater than or
4177             // equal to the current page.
4178             while (rgr && rgr->node.end < page_start) {
4179                 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
4180                 rgr_has_changed = true;
4181             }
4182 
4183             // Check whether the current page lies within rgr. A single page
4184             // must entirely reside within a range group range. Since we've
4185             // incremented rgr until its end is higher than page_start, we now
4186             // check if page_start lies within rgr.
4187             if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
4188                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
4189                 if (list_empty(&rgr->range_group_migrated_list_node))
4190                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
4191                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
4192 
4193                 rgr_has_changed = false;
4194             }
4195         }
4196 
4197         // No need to copy pages that haven't changed.  Just clear residency
4198         // information
4199         if (block_page_is_clean(block, dst_id, copy_state.dst.nid, src_id, copy_state.src.nid, page_index))
4200             continue;
4201 
4202         if (last_index == region.outer) {
4203             // Record all processors involved in the copy.
4204             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
4205             uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
4206         }
4207 
4208         if (block_copy_should_use_push(block, &copy_state)) {
4209             if (!copying_gpu) {
4210                 status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
4211 
4212                 if (status != NV_OK)
4213                     break;
4214 
4215                 copying_gpu = uvm_push_get_gpu(&push);
4216 
4217                 // Ensure that there is GPU state that can be used for CPU-to-CPU copies
4218                 if (UVM_ID_IS_CPU(dst_id) && uvm_id_equal(src_id, dst_id)) {
4219                     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, copying_gpu);
4220                     if (!gpu_state) {
4221                         status = NV_ERR_NO_MEMORY;
4222                         break;
4223                     }
4224                 }
4225 
4226                 // Record the GPU involved in the copy
4227                 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
4228 
4229                 // This function is called just once per VA block and needs to
4230                 // receive the "main" cause for the migration (it mainly checks if
4231                 // we are in the eviction path). Therefore, we pass cause instead
4232                 // of contig_cause
4233                 uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
4234             }
4235         }
4236         else {
4237             // For CPU-to-CPU copies using memcpy(), record the start of the
4238             // migration here. This will be reported in the migration event.
4239             cpu_migration_begin_timestamp = NV_GETTIME();
4240         }
4241 
4242         if (!uvm_va_block_is_hmm(block))
4243             block_update_page_dirty_state(block, dst_id, copy_state.dst.nid, src_id, copy_state.src.nid, page_index);
4244 
4245         if (last_index == region.outer) {
4246             bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
4247             bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
4248             contig_start_index = page_index;
4249             contig_cause = page_cause;
4250 
4251             if (block_copy_should_use_push(block, &copy_state)) {
4252                 // When CC is enabled, transfers between GPU and CPU don't rely on
4253                 // any GPU mapping of CPU chunks, physical or virtual.
                if (UVM_ID_IS_CPU(src_id) && g_uvm_global.conf_computing_enabled)
                    can_cache_src_phys_addr = false;

                if (UVM_ID_IS_CPU(dst_id) && g_uvm_global.conf_computing_enabled)
                    can_cache_dst_phys_addr = false;

4259                 // Computing the physical address is a non-trivial operation and
4260                 // seems to be a performance limiter on systems with 2 or more
4261                 // NVLINK links. Therefore, for physically-contiguous block
4262                 // storage, we cache the start address and compute the page address
4263                 // using the page index.
4264                 if (can_cache_src_phys_addr) {
4265                     copy_state.src.gpu_address = block_phys_page_copy_address(block,
4266                                                                               block_phys_page(src_id,
4267                                                                                               copy_state.src.nid,
4268                                                                                               0),
4269                                                                               copying_gpu);
4270                 }
4271                 if (can_cache_dst_phys_addr) {
4272                     copy_state.dst.gpu_address = block_phys_page_copy_address(block,
4273                                                                               block_phys_page(dst_id,
4274                                                                                               copy_state.dst.nid,
4275                                                                                               0),
4276                                                                               copying_gpu);
4277                 }
4278             }
4279         }
4280         else if ((page_index != last_index + 1) || contig_cause != page_cause) {
4281             contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
4282             UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
4283 
4284             // If both src and dst are physically-contiguous, consolidate copies
4285             // of contiguous pages into a single method.
4286             if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig) {
4287                 status = block_copy_pages(block, &copy_state, contig_region, &push);
4288                 if (status != NV_OK)
4289                     break;
4290             }
4291 
4292             if (block_copy_should_use_push(block, &copy_state)) {
4293                 uvm_perf_event_notify_migration(&va_space->perf_events,
4294                                                 &push,
4295                                                 block,
4296                                                 dst_id,
4297                                                 src_id,
4298                                                 uvm_va_block_region_start(block, contig_region),
4299                                                 uvm_va_block_region_size(contig_region),
4300                                                 transfer_mode,
4301                                                 contig_cause,
4302                                                 &block_context->make_resident);
4303             }
4304             else {
4305                 uvm_perf_event_notify_migration_cpu(&va_space->perf_events,
4306                                                     block,
4307                                                     copy_state.dst.nid,
4308                                                     copy_state.src.nid,
4309                                                     uvm_va_block_region_start(block, contig_region),
4310                                                     uvm_va_block_region_size(contig_region),
4311                                                     cpu_migration_begin_timestamp,
4312                                                     transfer_mode,
4313                                                     contig_cause,
4314                                                     &block_context->make_resident);
4315             }
4316 
4317             contig_start_index = page_index;
4318             contig_cause = page_cause;
4319         }
4320 
4321         if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig) {
4322             status = block_copy_pages(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
4323             if (status != NV_OK)
4324                 return status;
4325         }
4326 
4327         last_index = page_index;
4328     }
4329 
4330     // Copy the remaining pages
4331     contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
4332     if (uvm_va_block_region_size(contig_region) && uvm_va_block_region_contains_region(region, contig_region)) {
4333         if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig) {
4334             status = block_copy_pages(block, &copy_state, contig_region, &push);
4335             if (status != NV_OK)
4336                 return status;
4337         }
4338 
4339         if (block_copy_should_use_push(block, &copy_state)) {
4340             uvm_perf_event_notify_migration(&va_space->perf_events,
4341                                             &push,
4342                                             block,
4343                                             dst_id,
4344                                             src_id,
4345                                             uvm_va_block_region_start(block, contig_region),
4346                                             uvm_va_block_region_size(contig_region),
4347                                             transfer_mode,
4348                                             contig_cause,
4349                                             &block_context->make_resident);
4350         }
4351         else {
4352             uvm_perf_event_notify_migration_cpu(&va_space->perf_events,
4353                                                 block,
4354                                                 copy_state.dst.nid,
4355                                                 copy_state.src.nid,
4356                                                 uvm_va_block_region_start(block, contig_region),
4357                                                 uvm_va_block_region_size(contig_region),
4358                                                 cpu_migration_begin_timestamp,
4359                                                 transfer_mode,
4360                                                 contig_cause,
4361                                                 &block_context->make_resident);
4362         }
4363 
4364         if (block_copy_should_use_push(block, &copy_state) && copying_gpu)
4365             status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
4366     }
4367 
4368     // Update VA block status bits
4369     //
4370     // Only update the bits for the pages that succeeded
4371     if (status != NV_OK)
4372         uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
4373 
4374     *copied_pages = uvm_page_mask_weight(copy_mask);
4375     if (*copied_pages && migrated_pages)
4376         uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
4377 
4378     return status;
4379 }
4380 
static NV_STATUS block_copy_resident_pages_from(uvm_va_block_t *block,
4382                                                 uvm_va_block_context_t *block_context,
4383                                                 uvm_processor_id_t dst_id,
4384                                                 uvm_processor_id_t src_id,
4385                                                 int src_nid,
4386                                                 uvm_va_block_region_t region,
4387                                                 const uvm_page_mask_t *page_mask,
4388                                                 const uvm_page_mask_t *prefetch_page_mask,
4389                                                 uvm_va_block_transfer_mode_t transfer_mode,
4390                                                 uvm_page_mask_t *migrated_pages,
4391                                                 NvU32 *copied_pages_out,
4392                                                 uvm_tracker_t *copy_tracker)
4393 {
4394     uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
4395     uvm_page_mask_t *src_resident_mask;
4396     uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
4397     uvm_make_resident_page_tracking_t *page_tracking = &block_context->make_resident.cpu_pages_used;
4398     NvU32 copied_pages_from_src;
4399     NV_STATUS status = NV_OK;
4400     int dst_nid;
4401 
4402     src_resident_mask = uvm_va_block_resident_mask_get(block, src_id, src_nid);
4403     uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
4404 
4405     if (page_mask)
4406         uvm_page_mask_and(copy_mask, copy_mask, page_mask);
4407 
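    // When the destination is the CPU, split the copy by destination NUMA
    // node: each pass restricts the copy mask to the pages whose destination
    // chunks were allocated on that node.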
4408     if (UVM_ID_IS_CPU(dst_id)) {
4409         for_each_node_mask(dst_nid, page_tracking->nodes) {
4410             if (!uvm_page_mask_and(node_pages_mask, copy_mask, block_tracking_node_mask_get(block_context, dst_nid)))
4411                 continue;
4412 
4413             status = block_copy_resident_pages_between(block,
4414                                                        block_context,
4415                                                        dst_id,
4416                                                        dst_nid,
4417                                                        src_id,
4418                                                        src_nid,
4419                                                        region,
4420                                                        node_pages_mask,
4421                                                        prefetch_page_mask,
4422                                                        transfer_mode,
4423                                                        migrated_pages,
4424                                                        &copied_pages_from_src,
4425                                                        copy_tracker);
4426 
4427             *copied_pages_out += copied_pages_from_src;
4428 
4429             if (status != NV_OK)
4430                 break;
4431 
4432             if (!uvm_page_mask_andnot(copy_mask, copy_mask, node_pages_mask))
4433                 break;
4434         }
4435     }
4436     else {
4437         status = block_copy_resident_pages_between(block,
4438                                                    block_context,
4439                                                    dst_id,
4440                                                    NUMA_NO_NODE,
4441                                                    src_id,
4442                                                    src_nid,
4443                                                    region,
4444                                                    copy_mask,
4445                                                    prefetch_page_mask,
4446                                                    transfer_mode,
4447                                                    migrated_pages,
4448                                                    &copied_pages_from_src,
4449                                                    copy_tracker);
4450         *copied_pages_out += copied_pages_from_src;
4451     }
4452 
4453     return status;
4454 }
4455 
4456 // Copy resident pages to the destination from all source processors in the
4457 // src_processor_mask
4458 //
4459 // The function adds the pages that were successfully copied to the output
4460 // migrated_pages mask and returns the number of pages in copied_pages. These
4461 // fields are reliable even if an error is returned.
4462 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
4463                                                 uvm_va_block_context_t *block_context,
4464                                                 uvm_processor_id_t dst_id,
4465                                                 const uvm_processor_mask_t *src_processor_mask,
4466                                                 uvm_va_block_region_t region,
4467                                                 const uvm_page_mask_t *page_mask,
4468                                                 const uvm_page_mask_t *prefetch_page_mask,
4469                                                 uvm_va_block_transfer_mode_t transfer_mode,
4470                                                 NvU32 max_pages_to_copy,
4471                                                 uvm_page_mask_t *migrated_pages,
4472                                                 NvU32 *copied_pages_out,
4473                                                 uvm_tracker_t *tracker_out)
4474 {
4475     NV_STATUS status = NV_OK;
4476     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4477     uvm_processor_id_t src_id;
4478     uvm_processor_mask_t *search_mask;
4479 
4480     *copied_pages_out = 0;
4481 
4482     search_mask = uvm_processor_mask_cache_alloc();
4483     if (!search_mask)
4484         return NV_ERR_NO_MEMORY;
4485 
4486     uvm_processor_mask_copy(search_mask, src_processor_mask);
4487 
4488     for_each_closest_id(src_id, search_mask, dst_id, va_space) {
4491         if (UVM_ID_IS_CPU(src_id)) {
4492             int nid;
4493 
4494             for_each_possible_uvm_node(nid) {
4495                 status = block_copy_resident_pages_from(block,
4496                                                         block_context,
4497                                                         dst_id,
4498                                                         src_id,
4499                                                         nid,
4500                                                         region,
4501                                                         page_mask,
4502                                                         prefetch_page_mask,
4503                                                         transfer_mode,
4504                                                         migrated_pages,
4505                                                         copied_pages_out,
4506                                                         tracker_out);
4507 
4508                 if (status != NV_OK)
4509                     break;
4510             }
4511         }
4512         else {
4513             status = block_copy_resident_pages_from(block,
4514                                                     block_context,
4515                                                     dst_id,
4516                                                     src_id,
4517                                                     NUMA_NO_NODE,
4518                                                     region,
4519                                                     page_mask,
4520                                                     prefetch_page_mask,
4521                                                     transfer_mode,
4522                                                     migrated_pages,
4523                                                     copied_pages_out,
4524                                                     tracker_out);
4525 
4526         }
4527 
4528         UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
4529 
4530         if (status != NV_OK)
4531             break;
4532 
4533         // Break out once we have copied the maximum number of pages
4534         if (*copied_pages_out == max_pages_to_copy)
4535             break;
4536     }
4537 
4538     uvm_processor_mask_cache_free(search_mask);
4539     return status;
4540 }
4541 
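// Break read duplication for the given pages: clear their read_duplicated bit
// and clear their residency on every processor other than dst_id, so that
// dst_id (or, for CPU destinations, the destination NUMA node) is left as the
// only resident location.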
4542 static void break_read_duplication_in_region(uvm_va_block_t *block,
4543                                              uvm_va_block_context_t *block_context,
4544                                              uvm_processor_id_t dst_id,
4545                                              uvm_va_block_region_t region,
4546                                              const uvm_page_mask_t *page_mask)
4547 {
4548     uvm_processor_id_t id;
4549     uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask;
4550 
4551     uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
4552 
4553     UVM_ASSERT(
4554         uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE)));
4555 
4556     // Clear read_duplicated bit for all pages in region
4557     uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
4558 
4559     // Clear residency bits for all processors other than dst_id
4560     for_each_id_in_mask(id, &block->resident) {
4561         uvm_page_mask_t *other_resident_mask;
4562 
4563         // Skip the destination processor, unless it's the CPU and a specific
4564         // NUMA node is the target destination. This is because CPU-to-CPU
4565         // migrations will switch the residency from one NUMA node to another
4566         // but the resident processor will remain the CPU.
4567         if (uvm_id_equal(id, dst_id) &&
4568             (!UVM_ID_IS_CPU(dst_id) || block_context->make_resident.dest_nid == NUMA_NO_NODE))
4569             continue;
4570 
4571         if (UVM_ID_IS_CPU(id)) {
4572             uvm_va_block_cpu_clear_resident_all_chunks(block, block_context, break_pages_in_region);
4573             other_resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
4574         }
4575         else {
4576             other_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
4577             uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region);
4578         }
4579 
4580         if (uvm_page_mask_empty(other_resident_mask))
4581             block_clear_resident_processor(block, id);
4582     }
4583 }
4584 
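// Mark pages in the region that are not yet resident anywhere as resident on
// dst_id. These first-touch pages were populated directly at the destination,
// so no copy is needed; they are also added to the pages_changed_residency
// output mask.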
4585 static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
4586                                                  uvm_va_block_context_t *block_context,
4587                                                  uvm_processor_id_t dst_id,
4588                                                  uvm_va_block_region_t region,
4589                                                  const uvm_page_mask_t *page_mask)
4590 {
4591     uvm_page_index_t page_index;
4592     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE);
4593     uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
4594 
4595     if (page_mask)
4596         uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask);
4597     else
4598         uvm_page_mask_complement(first_touch_mask, resident_mask);
4599 
4600     uvm_page_mask_region_clear_outside(first_touch_mask, region);
4601 
4602     for_each_va_block_page_in_mask(page_index, first_touch_mask, block) {
4603         UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index));
4604         UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
4605         UVM_ASSERT(block_check_resident_proximity(block, block_context, page_index, dst_id));
4606     }
4607 
4608     if (UVM_ID_IS_CPU(dst_id)) {
4609         uvm_va_block_cpu_set_resident_all_chunks(block, block_context, first_touch_mask);
4610         resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
4611     }
4612     else {
4613         uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
4614     }
4615 
4616     if (!uvm_page_mask_empty(resident_mask))
4617         block_set_resident_processor(block, dst_id);
4618 
4619     // Add them to the output mask, too
4620     uvm_page_mask_or(&block_context->make_resident.pages_changed_residency,
4621                      &block_context->make_resident.pages_changed_residency,
4622                      first_touch_mask);
4623 }
4624 
4625 // Select the set of CPU pages to be used for the migration. The pages selected
4626 // could be used for either CPU destination pages (when the destination of the
4627 // migration is the CPU) or staging pages (when the migration to the destination
4628 // processor requires staging through the CPU).
4629 static void block_select_cpu_node_pages(uvm_va_block_t *block,
4630                                         uvm_va_block_context_t *block_context,
4631                                         const uvm_page_mask_t *page_mask,
4632                                         uvm_va_block_region_t region)
4633 {
4634     uvm_va_block_cpu_node_state_t *node_state;
4635     uvm_make_resident_page_tracking_t *tracking = &block_context->make_resident.cpu_pages_used;
4636     uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
4637     uvm_page_mask_t *node_mask;
4638     int nid;
4639 
4640     if (uvm_page_mask_empty(page_mask))
4641         return;
4642 
4643     block_context->scratch_node_mask = node_possible_map;
4644     uvm_page_mask_init_from_region(scratch_page_mask, region, page_mask);
4645 
4646     for_each_closest_uvm_node(nid, uvm_va_block_context_get_node(block_context), block_context->scratch_node_mask) {
4647         node_state = block_node_state_get(block, nid);
4648         node_mask = block_tracking_node_mask_get(block_context, nid);
4649         if (uvm_page_mask_and(node_mask, scratch_page_mask, &node_state->allocated)) {
4650             node_set(nid, tracking->nodes);
4651             if (!uvm_page_mask_andnot(scratch_page_mask, scratch_page_mask, node_mask))
4652                 return;
4653         }
4654     }
4655 }
4656 
4657 // Copy resident pages from other processors to the destination.
4658 // All the pages on the destination need to be populated by the caller first.
4659 // Pages not resident anywhere else need to be zeroed out as well.
4660 // The transfer_mode is only used to tell uvm_perf_event_notify_migration()
4661 // whether the copy is for a migration or read duplication.
4662 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
4663                                            uvm_va_block_context_t *block_context,
4664                                            uvm_processor_id_t dst_id,
4665                                            uvm_va_block_region_t region,
4666                                            const uvm_page_mask_t *page_mask,
4667                                            const uvm_page_mask_t *prefetch_page_mask,
4668                                            uvm_va_block_transfer_mode_t transfer_mode)
4669 {
4670     NV_STATUS status = NV_OK;
4671     NV_STATUS tracker_status;
4672     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
4673     uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block,
4674                                                                     dst_id,
4675                                                                     block_context->make_resident.dest_nid);
4676     NvU32 missing_pages_count;
4677     NvU32 pages_copied;
4678     NvU32 pages_copied_to_cpu = 0;
4679     uvm_processor_mask_t *src_processor_mask = NULL;
4680     uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
4681     uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
4682     uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
4683     uvm_page_mask_t *cpu_page_mask;
4684     uvm_page_mask_t *numa_resident_pages;
4685     int nid;
4686 
4687     uvm_page_mask_zero(migrated_pages);
4688 
4689     if (page_mask)
4690         uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
4691     else
4692         uvm_page_mask_complement(copy_page_mask, resident_mask);
4693 
4694     missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region);
4695 
4696     if (missing_pages_count == 0)
4697         goto out;
4698 
4699     src_processor_mask = uvm_processor_mask_cache_alloc();
4700     if (!src_processor_mask) {
4701         status = NV_ERR_NO_MEMORY;
4702         goto out;
4703     }
4704 
4705     // TODO: Bug 1753731: Add P2P2P copies staged through a GPU
4706     // TODO: Bug 1753731: When a page is resident in multiple locations due to
4707     //       read-duplication, spread out the source of the copy so we don't
4708     //       bottleneck on a single location.
4709 
4710     uvm_processor_mask_zero(src_processor_mask);
4711 
4712     if (UVM_ID_IS_GPU(dst_id)) {
4713         // If the destination is a GPU, first copy everything from processors
4714         // with copy access supported. Notably this will copy pages from the CPU
4715         // as well even if later some extra copies from CPU are required for
4716         // staged copies.
4717         uvm_processor_mask_and(src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
4718         uvm_processor_mask_clear(src_processor_mask, dst_id);
4719 
4720         cpu_page_mask = pages_staged;
4721     }
4722     else {
4723         cpu_page_mask = copy_page_mask;
4724     }
4725 
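    // Select the CPU NUMA nodes backing the pages in cpu_page_mask. These CPU
    // pages are the final destination when dst_id is the CPU, or staging
    // buffers for the copy to a GPU destination otherwise.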
4726     block_select_cpu_node_pages(block, block_context, cpu_page_mask, region);
4727 
4728     if (UVM_ID_IS_GPU(dst_id)) {
4729         status = block_copy_resident_pages_mask(block,
4730                                                 block_context,
4731                                                 dst_id,
4732                                                 src_processor_mask,
4733                                                 region,
4734                                                 copy_page_mask,
4735                                                 prefetch_page_mask,
4736                                                 transfer_mode,
4737                                                 missing_pages_count,
4738                                                 migrated_pages,
4739                                                 &pages_copied,
4740                                                 &local_tracker);
4741 
4742         UVM_ASSERT(missing_pages_count >= pages_copied);
4743         missing_pages_count -= pages_copied;
4744 
4745         if (status != NV_OK)
4746             goto out;
4747 
4748         if (missing_pages_count == 0) {
4749             UVM_ASSERT(uvm_page_mask_empty(pages_staged));
4750             goto out;
4751         }
4752 
4753         if (pages_copied)
4754             uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
4755     }
4756 
4757     // Now copy from everywhere else to the CPU. This is both for when the
4758     // destination is the CPU (src_processor_mask empty) and for a staged copy
4759     // (src_processor_mask containing processors with copy access to dst_id).
4760     uvm_processor_mask_andnot(src_processor_mask, &block->resident, src_processor_mask);
4761 
4762     // If the destination is the CPU but not all pages are resident on the
4763     // destination NUMA node, the CPU is still a source.
4764     numa_resident_pages = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, block_context->make_resident.dest_nid);
4765     if (!UVM_ID_IS_CPU(dst_id) || uvm_page_mask_subset(copy_page_mask, numa_resident_pages)) {
4766         uvm_processor_mask_clear(src_processor_mask, dst_id);
4767         uvm_processor_mask_clear(src_processor_mask, UVM_ID_CPU);
4768     }
4769 
4771     if (!uvm_page_mask_empty(cpu_page_mask)) {
4772         status = block_copy_resident_pages_mask(block,
4773                                                 block_context,
4774                                                 UVM_ID_CPU,
4775                                                 src_processor_mask,
4776                                                 region,
4777                                                 cpu_page_mask,
4778                                                 prefetch_page_mask,
4779                                                 transfer_mode,
4780                                                 missing_pages_count,
4781                                                 UVM_ID_IS_CPU(dst_id) ? migrated_pages : NULL,
4782                                                 &pages_copied_to_cpu,
4783                                                 &local_tracker);
4784 
4785         if (status != NV_OK)
4786             goto out;
4787     }
4788 
4789     // If destination is the CPU then we copied everything there above
4790     if (!UVM_ID_IS_GPU(dst_id))
4791         goto out;
4792 
4793     // Add everything to the block's tracker so that the
4794     // block_copy_resident_pages_between() call below will acquire it.
4795     status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
4796     if (status != NV_OK)
4797         goto out;
4798     uvm_tracker_clear(&local_tracker);
4799 
4800     // Now copy staged pages from the CPU to the destination.
4801     // The staging copy above could have allocated pages on any NUMA node.
4802     // Loop over all nodes where pages were allocated and copy from those
4803     // nodes.
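    // Illustrative flow for a GPU destination that required staging:
    //
    //     source --> CPU staging pages (pages_staged, per NUMA node) --> dst_id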
4804     pages_copied = 0;
4805     for_each_node_mask(nid, block_context->make_resident.cpu_pages_used.nodes) {
4806         NvU32 pages_copied_from_node;
4807         uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
4808         uvm_page_mask_t *node_alloc_mask = block_tracking_node_mask_get(block_context, nid);
4809 
4810         if (uvm_page_mask_and(node_pages_mask, pages_staged, node_alloc_mask)) {
4811             status = block_copy_resident_pages_between(block,
4812                                                        block_context,
4813                                                        dst_id,
4814                                                        NUMA_NO_NODE,
4815                                                        UVM_ID_CPU,
4816                                                        nid,
4817                                                        region,
4818                                                        node_pages_mask,
4819                                                        prefetch_page_mask,
4820                                                        transfer_mode,
4821                                                        migrated_pages,
4822                                                        &pages_copied_from_node,
4823                                                        &local_tracker);
4824             UVM_ASSERT(missing_pages_count >= pages_copied_from_node);
4825             missing_pages_count -= pages_copied_from_node;
4826             pages_copied += pages_copied_from_node;
4827         }
4828 
4829         if (status != NV_OK)
4830             break;
4831     }
4832 
4833     if (status != NV_OK)
4834         goto out;
4835 
4836     // If we get here, that means we were staging the copy through the CPU and
4837     // we should copy as many pages from the CPU as we copied to the CPU.
4838     UVM_ASSERT(pages_copied == pages_copied_to_cpu);
4839 
4840 out:
4841     // Add everything from the local tracker to the block's tracker.
4842     // Notably this is also needed for handling
4843     // block_copy_resident_pages_between() failures in the first loop.
4844     tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
4845     uvm_tracker_deinit(&local_tracker);
4846     uvm_processor_mask_cache_free(src_processor_mask);
4847 
4848     return status == NV_OK ? tracker_status : status;
4849 }
4850 
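// First phase of uvm_va_block_make_resident(): unmap non-resident and
// read-duplicated pages from mapped processors (except UVM-Lite GPUs),
// populate the destination, and copy resident data to it. Residency
// bookkeeping is completed afterwards by uvm_va_block_make_resident_finish().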
4851 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
4852                                           uvm_va_block_retry_t *va_block_retry,
4853                                           uvm_va_block_context_t *va_block_context,
4854                                           uvm_processor_id_t dest_id,
4855                                           uvm_va_block_region_t region,
4856                                           const uvm_page_mask_t *page_mask,
4857                                           const uvm_page_mask_t *prefetch_page_mask,
4858                                           uvm_make_resident_cause_t cause)
4859 {
4860     NV_STATUS status = NV_OK;
4861     uvm_processor_mask_t *unmap_processor_mask;
4862     uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask;
4863     uvm_page_mask_t *resident_mask;
4864 
4865     va_block_context->make_resident.dest_id = dest_id;
4866     va_block_context->make_resident.cause = cause;
4867     nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
4868 
4869     if (prefetch_page_mask) {
4870         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
4871                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
4872                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
4873     }
4874 
4875     uvm_assert_mutex_locked(&va_block->lock);
4876     UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
4877 
4878     unmap_processor_mask = uvm_processor_mask_cache_alloc();
4879     if (!unmap_processor_mask) {
4880         status = NV_ERR_NO_MEMORY;
4881         goto out;
4882     }
4883 
4884     resident_mask = block_resident_mask_get_alloc(va_block, dest_id, va_block_context->make_resident.dest_nid);
4885     if (!resident_mask) {
4886         status = NV_ERR_NO_MEMORY;
4887         goto out;
4888     }
4889 
4890     // Unmap all mapped processors except for UVM-Lite GPUs as their mappings
4891     // are largely persistent.
4892     uvm_processor_mask_andnot(unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
4893 
4894     if (page_mask)
4895         uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask);
4896     else
4897         uvm_page_mask_complement(unmap_page_mask, resident_mask);
4898     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
4899 
4900     // Unmap all pages not resident on the destination
4901     status = uvm_va_block_unmap_mask(va_block, va_block_context, unmap_processor_mask, region, unmap_page_mask);
4902     if (status != NV_OK)
4903         goto out;
4904 
4905     if (page_mask)
4906         uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages);
4907     else
4908         uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages);
4909     uvm_page_mask_region_clear_outside(unmap_page_mask, region);
4910 
4911     // Also unmap read-duplicated pages excluding dest_id
4912     uvm_processor_mask_clear(unmap_processor_mask, dest_id);
4913     status = uvm_va_block_unmap_mask(va_block, va_block_context, unmap_processor_mask, region, unmap_page_mask);
4914     if (status != NV_OK)
4915         goto out;
4916 
4917     uvm_tools_record_read_duplicate_invalidate(va_block,
4918                                                dest_id,
4919                                                region,
4920                                                unmap_page_mask);
4921 
4922     // Note that block_populate_pages and block_copy_resident_pages also use
4923     // va_block_context->make_resident.page_mask.
4924     unmap_page_mask = NULL;
4925 
4926     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
4927     if (status != NV_OK)
4928         goto out;
4929 
4930     status = block_copy_resident_pages(va_block,
4931                                        va_block_context,
4932                                        dest_id,
4933                                        region,
4934                                        page_mask,
4935                                        prefetch_page_mask,
4936                                        UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
4937 
4938 out:
4939     uvm_processor_mask_cache_free(unmap_processor_mask);
4940     return status;
4941 }
4942 
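// Clear the given pages from dst_id's evicted mask. If no evicted pages remain
// on that GPU, it is also removed from the block's evicted_gpus mask.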
4943 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block,
4944                                               uvm_processor_id_t dst_id,
4945                                               uvm_page_mask_t *page_mask)
4946 {
4947     uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id);
4948 
4949     UVM_ASSERT(dst_gpu_state);
4950 
4951     if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask))
4952         uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id);
4953 }
4954 
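// Update residency tracking after the pages in copy_mask have migrated to
// dst_id: set the new residency (per chunk NUMA node for the CPU), accumulate
// the pages into pages_changed_residency, and update the eviction state based
// on the migration cause.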
4955 static void block_make_resident_update_state(uvm_va_block_t *va_block,
4956                                              uvm_va_block_context_t *va_block_context,
4957                                              uvm_processor_id_t dst_id,
4958                                              uvm_va_block_region_t region,
4959                                              uvm_page_mask_t *copy_mask,
4960                                              uvm_make_resident_cause_t cause)
4961 {
4962     if (UVM_ID_IS_CPU(dst_id)) {
4963         // CPU chunks may not have been allocated on the preferred NUMA node, so
4964         // the residency has to be updated based on each chunk's NUMA node ID.
4965         uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, copy_mask);
4966     }
4967     else {
4968         uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id, NUMA_NO_NODE);
4969 
4970         uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
4971     }
4972 
4973     block_set_resident_processor(va_block, dst_id);
4974 
4975     // Accumulate the pages that migrated into the output mask.
4976     uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency,
4977                      &va_block_context->make_resident.pages_changed_residency,
4978                      copy_mask);
4979 
4980     // Any move operation implies that mappings have been removed from all
4981     // non-UVM-Lite GPUs.
4982     if (!uvm_va_block_is_hmm(va_block))
4983         uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
4984 
4985     // If we are migrating due to an eviction, mark the source GPUs as evicted
4986     // and record which of their pages were evicted. If we are instead migrating
4987     // away from the CPU, those pages are no longer considered evicted.
4988     if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
4989         uvm_processor_id_t src_id;
4990 
4991         UVM_ASSERT(UVM_ID_IS_CPU(dst_id));
4992 
4993         // Note that the destination is the CPU so this loop excludes it.
4994         for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) {
4995             uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id);
4996 
4997             UVM_ASSERT(src_gpu_state);
4998 
4999             uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask);
5000             uvm_processor_mask_set(&va_block->evicted_gpus, src_id);
5001         }
5002     }
5003     else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id))
5004         block_make_resident_clear_evicted(va_block, dst_id, copy_mask);
5005 }
5006 
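// Second phase of uvm_va_block_make_resident(): commit the residency updates
// for the pages migrated or first-touch populated by
// uvm_va_block_make_resident_copy(), break read duplication, and refresh the
// eviction heuristics.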
5007 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
5008                                        uvm_va_block_context_t *va_block_context,
5009                                        uvm_va_block_region_t region,
5010                                        const uvm_page_mask_t *page_mask)
5011 {
5012     uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
5013     uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id;
5014 
5015     uvm_assert_mutex_locked(&va_block->lock);
5016 
5017     if (page_mask)
5018         uvm_page_mask_and(migrated_pages, migrated_pages, page_mask);
5019 
5020     if (!uvm_page_mask_empty(migrated_pages)) {
5021         // The migrated pages are now resident on the destination.
5022         block_make_resident_update_state(va_block,
5023                                          va_block_context,
5024                                          dst_id,
5025                                          region,
5026                                          migrated_pages,
5027                                          va_block_context->make_resident.cause);
5028     }
5029 
5030     // Pages that weren't resident anywhere else were populated at the
5031     // destination directly. Mark them as resident now.
5032     block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask);
5033 
5034     // Break read duplication and clear residency from other processors.
5035     break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask);
5036 
5037     // Update eviction heuristics, if needed. Notably this could repeat the call
5038     // done in block_set_resident_processor(), but that doesn't do anything bad
5039     // and it's simpler to keep it in both places.
5040     //
5041     // Skip this if we didn't do anything (the input region and/or page mask was
5042     // empty).
5043     if (uvm_processor_mask_test(&va_block->resident, dst_id))
5044         block_mark_memory_used(va_block, dst_id);
5045 
5046     // Check state of all chunks after residency change.
5047     // TODO: Bug 4207783: Check both CPU and GPU chunks.
5048     UVM_ASSERT(block_check_cpu_chunks(va_block));
5049 }
5050 
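// Convenience wrapper that runs the two phases back to back: the copy phase
// followed immediately by the residency-update (finish) phase.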
5051 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
5052                                      uvm_va_block_retry_t *va_block_retry,
5053                                      uvm_va_block_context_t *va_block_context,
5054                                      uvm_processor_id_t dest_id,
5055                                      uvm_va_block_region_t region,
5056                                      const uvm_page_mask_t *page_mask,
5057                                      const uvm_page_mask_t *prefetch_page_mask,
5058                                      uvm_make_resident_cause_t cause)
5059 {
5060     NV_STATUS status;
5061 
5062     status = uvm_va_block_make_resident_copy(va_block,
5063                                              va_block_retry,
5064                                              va_block_context,
5065                                              dest_id,
5066                                              region,
5067                                              page_mask,
5068                                              prefetch_page_mask,
5069                                              cause);
5070     if (status != NV_OK)
5071         return status;
5072 
5073     uvm_va_block_make_resident_finish(va_block,
5074                                       va_block_context,
5075                                       region,
5076                                       page_mask);
5077 
5078     return NV_OK;
5079 }
5080 
5081 // Combination function which prepares the input {region, page_mask} for
5082 // entering read-duplication. It:
5083 // - Unmaps all processors but revoke_id
5084 // - Revokes write access from revoke_id
5085 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block,
5086                                                    uvm_va_block_context_t *va_block_context,
5087                                                    uvm_processor_id_t revoke_id,
5088                                                    uvm_va_block_region_t region,
5089                                                    const uvm_page_mask_t *page_mask)
5090 {
5091     uvm_processor_mask_t *unmap_processor_mask;
5092     uvm_processor_id_t unmap_id;
5093     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
5094     NV_STATUS status, tracker_status;
5095 
5096     unmap_processor_mask = uvm_processor_mask_cache_alloc();
5097     if (!unmap_processor_mask)
5098         return NV_ERR_NO_MEMORY;
5099 
5100     // Unmap everybody except revoke_id
5101     uvm_processor_mask_andnot(unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
5102     uvm_processor_mask_clear(unmap_processor_mask, revoke_id);
5103 
5104     for_each_id_in_mask(unmap_id, unmap_processor_mask) {
5105         status = uvm_va_block_unmap(va_block, va_block_context, unmap_id, region, page_mask, &local_tracker);
5106         if (status != NV_OK)
5107             goto out;
5108     }
5109 
5110     // Revoke WRITE/ATOMIC access permissions from the remaining mapped
5111     // processor.
5112     status = uvm_va_block_revoke_prot(va_block,
5113                                       va_block_context,
5114                                       revoke_id,
5115                                       region,
5116                                       page_mask,
5117                                       UVM_PROT_READ_WRITE,
5118                                       &local_tracker);
5119     if (status != NV_OK)
5120         goto out;
5121 
5122 out:
5123     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
5124     uvm_tracker_deinit(&local_tracker);
5125     uvm_processor_mask_cache_free(unmap_processor_mask);
5126     return status == NV_OK ? tracker_status : status;
5127 }
5128 
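// Like uvm_va_block_make_resident(), but the source copies are left in place
// so the pages become read-duplicated on dest_id: remote mappings are
// unmapped, write permission is revoked from the processor holding the
// resident copy, and the data is copied (not moved) to the destination.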
5129 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
5130                                                     uvm_va_block_retry_t *va_block_retry,
5131                                                     uvm_va_block_context_t *va_block_context,
5132                                                     uvm_processor_id_t dest_id,
5133                                                     uvm_va_block_region_t region,
5134                                                     const uvm_page_mask_t *page_mask,
5135                                                     const uvm_page_mask_t *prefetch_page_mask,
5136                                                     uvm_make_resident_cause_t cause)
5137 {
5138     NV_STATUS status = NV_OK;
5139     uvm_processor_id_t src_id;
5140     uvm_page_mask_t *dst_resident_mask;
5141     uvm_page_mask_t *migrated_pages;
5142     uvm_page_mask_t *staged_pages;
5143     uvm_page_mask_t *scratch_residency_mask;
5144 
5145     // TODO: Bug 3660922: need to implement HMM read duplication support.
5146     UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
5147 
5148     va_block_context->make_resident.dest_id = dest_id;
5149     va_block_context->make_resident.cause = cause;
5150     nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
5151 
5152     if (prefetch_page_mask) {
5153         // TODO: Bug 1877578: investigate automatic read-duplicate policies
5154         UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
5155                    cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
5156                    cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
5157     }
5158 
5159     uvm_assert_mutex_locked(&va_block->lock);
5160     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
5161 
5162     scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
5163     if (!scratch_residency_mask)
5164         return NV_ERR_NO_MEMORY;
5165 
5166     // For pages that are entering read-duplication we need to unmap remote
5167     // mappings and revoke RW and higher access permissions.
5168     //
5169     // The current implementation:
5170     // - Unmaps pages from all processors but the one with the resident copy
5171     // - Revokes write access from the processor with the resident copy
5172     for_each_id_in_mask(src_id, &va_block->resident) {
5173         // Note that the below calls to block_populate_pages and
5174         // block_copy_resident_pages also use
5175         // va_block_context->make_resident.page_mask.
5176         uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
5177         const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
5178         UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
5179 
5180         if (page_mask)
5181             uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages);
5182         else
5183             uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages);
5184 
5185         // If there are no pages that need to be unmapped/revoked, skip to the
5186         // next processor
5187         if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask))
5188             continue;
5189 
5190         status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
5191         if (status != NV_OK)
5192             goto out;
5193     }
5194 
5195     status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
5196     if (status != NV_OK)
5197         goto out;
5198 
5199     status = block_copy_resident_pages(va_block,
5200                                        va_block_context,
5201                                        dest_id,
5202                                        region,
5203                                        page_mask,
5204                                        prefetch_page_mask,
5205                                        UVM_VA_BLOCK_TRANSFER_MODE_COPY);
5206     if (status != NV_OK)
5207         goto out;
5208 
5209     // Pages that weren't resident anywhere else were populated at the
5210     // destination directly. Mark them as resident now, since there were no
5211     // errors from block_copy_resident_pages() above.
5212     migrated_pages = &va_block_context->make_resident.pages_migrated;
5213     uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
5214     uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
5215 
5216     if (!uvm_page_mask_empty(scratch_residency_mask))
5217         block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
5218 
5219     staged_pages = &va_block_context->make_resident.pages_staged;
5220     if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
5221         uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, staged_pages);
5222         block_set_resident_processor(va_block, UVM_ID_CPU);
5223         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
5224         uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
5225     }
5226 
5227     if (!uvm_page_mask_empty(migrated_pages)) {
5228         if (UVM_ID_IS_CPU(dest_id)) {
5229             // Check if the CPU is already in the resident set of processors.
5230             // We need to do this since we can't have multiple NUMA nodes with
5231             // resident pages.
5232             // If any of the migrated pages were already resident on the CPU, the
5233             // residency has to be switched to the destination NUMA node.
5234             if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
5235                 uvm_page_mask_and(scratch_residency_mask,
5236                                   uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
5237                                   migrated_pages)) {
5238                 uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
5239             }
5240 
5241             uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
5242         }
5243         else {
5244             dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
5245             uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
5246         }
5247 
5248         block_set_resident_processor(va_block, dest_id);
5249         uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
5250         uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
5251     }
5252 
5253     UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION);
5254     if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id))
5255         block_make_resident_clear_evicted(va_block, dest_id, migrated_pages);
5256 
5257     // Update eviction heuristics, if needed. Notably this could repeat the call
5258     // done in block_set_resident_processor(), but that doesn't do anything bad
5259     // and it's simpler to keep it in both places.
5260     //
5261     // Skip this if we didn't do anything (the input region and/or page mask was
5262     // empty).
5263     if (uvm_processor_mask_test(&va_block->resident, dest_id))
5264         block_mark_memory_used(va_block, dest_id);
5265 
5266     // Check state of all chunks after residency change.
5267     // TODO: Bug 4207783: Check both CPU and GPU chunks.
5268     UVM_ASSERT(block_check_cpu_chunks(va_block));
5269 out:
5270     kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
5271     return status;
5272 }
5273 
5274 // Looks up the current CPU mapping state of page from the
5275 // block->cpu.pte_bits bitmaps. If write access is enabled,
5276 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since
5277 // write access implies atomic access for CPUs.
5278 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index)
5279 {
5280     uvm_prot_t prot;
5281 
5282     UVM_ASSERT(!uvm_va_block_is_dead(block));
5283 
5284     if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index))
5285         prot = UVM_PROT_READ_WRITE_ATOMIC;
5286     else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
5287         prot = UVM_PROT_READ_ONLY;
5288     else
5289         prot = UVM_PROT_NONE;
5290 
5291     return prot;
5292 }
5293 
5294 // Looks up the current GPU mapping state of page from the
5295 // block->gpus[i]->pte_bits bitmaps.
5296 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index)
5297 {
5298     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5299     uvm_prot_t prot;
5300 
5301     UVM_ASSERT(!uvm_va_block_is_dead(block));
5302 
5303     if (!gpu_state)
5304         return UVM_PROT_NONE;
5305 
5306     if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
5307         prot = UVM_PROT_READ_WRITE_ATOMIC;
5308     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
5309         prot = UVM_PROT_READ_WRITE;
5310     else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
5311         prot = UVM_PROT_READ_ONLY;
5312     else
5313         prot = UVM_PROT_NONE;
5314 
5315     return prot;
5316 }
5317 
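// Looks up the current mapping protection of the page on the given processor,
// dispatching to the CPU or GPU helper above.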
5318 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index)
5319 {
5320     if (UVM_ID_IS_CPU(id))
5321         return block_page_prot_cpu(block, page_index);
5322     else
5323         return block_page_prot_gpu(block, block_get_gpu(block, id), page_index);
5324 }
5325 
5326 // Returns true if the block has any valid CPU PTE mapping in the block region.
5327 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region)
5328 {
5329     size_t valid_page;
5330 
5331     UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block));
5332 
5333     // Early-out: check whether any address in this block has a CPU mapping
5334     if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
5335         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]));
5336         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
5337         return false;
5338     }
5339 
5340     // All valid mappings have at least read permissions so we only need to
5341     // inspect the read bits.
5342     valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
5343     if (valid_page == region.outer)
5344         return false;
5345 
5346     UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE);
5347     return true;
5348 }
5349 
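// Sanity check that each indirect peer of the chunk's GPU has a reverse sysmem
// mapping pointing back at this block, at the chunk's page index and with the
// chunk's size. Always returns true; violations trip the asserts.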
5350 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
5351 {
5352     uvm_gpu_t *accessing_gpu;
5353     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5354 
5355     if (!uvm_pmm_sysmem_mappings_indirect_supported())
5356         return true;
5357 
5358     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
5359         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
5360         uvm_reverse_map_t reverse_map;
5361         size_t num_mappings;
5362 
5363         num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
5364                                                            peer_addr,
5365                                                            uvm_gpu_chunk_get_size(chunk),
5366                                                            &reverse_map,
5367                                                            1);
5368         UVM_ASSERT(num_mappings == 1);
5369         UVM_ASSERT(reverse_map.va_block == block);
5370         UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
5371         UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
5372 
5373         uvm_va_block_release_no_destroy(reverse_map.va_block);
5374     }
5375 
5376     return true;
5377 }
5378 
5379 // Sanity check the given GPU's chunks array
5380 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
5381 {
5382     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
5383     uvm_gpu_t *gpu;
5384     size_t i, num_chunks;
5385     uvm_page_index_t page_index;
5386     uvm_chunk_size_t chunk_size;
5387 
5388     if (!gpu_state)
5389         return true;
5390 
5391     gpu = block_get_gpu(block, id);
5392 
5393     num_chunks = block_num_gpu_chunks(block, gpu);
5394     for (page_index = 0, i = 0; i < num_chunks; i++) {
5395         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
5396         size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
5397 
5398         if (chunk_index != i) {
5399             UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n",
5400                            chunk_index,
5401                            i,
5402                            block->start,
5403                            block->end + 1,
5404                            uvm_id_value(id),
5405                            page_index);
5406             return false;
5407         }
5408 
5409         if (chunk) {
5410             if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
5411                 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
5412                               chunk_size,
5413                               uvm_gpu_chunk_get_size(chunk),
5414                               block->start,
5415                               block->end + 1,
5416                               uvm_id_value(id),
5417                               page_index,
5418                               i);
5419                 return false;
5420             }
5421 
5422             if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
5423                 UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
5424                               uvm_pmm_gpu_chunk_state_string(chunk->state),
5425                               block->start,
5426                               block->end + 1,
5427                               uvm_id_value(id),
5428                               page_index,
5429                               i,
5430                               chunk_size);
5431                 return false;
5432             }
5433 
5434             UVM_ASSERT(chunk->va_block == block);
5435             UVM_ASSERT(chunk->va_block_page_index == page_index);
5436 
5437             UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
5438         }
5439 
5440         page_index += chunk_size / PAGE_SIZE;
5441     }
5442 
5443     return true;
5444 }
5445 
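// Sanity check the chunk arrays of every GPU with state allocated in this
// block, followed by the CPU chunks.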
5446 static bool block_check_chunks(uvm_va_block_t *va_block)
5447 {
5448     uvm_gpu_id_t id;
5449 
5450     for_each_gpu_id(id) {
5451         if (!block_check_gpu_chunks(va_block, id))
5452             return false;
5453     }
5454 
5455     return block_check_cpu_chunks(va_block);
5456 }
5457 
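// Scratch processor masks used by block_check_mappings_page(). Grouping them
// in one struct allows a single heap allocation, presumably to keep these
// large masks off the kernel stack.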
5458 typedef struct
5459 {
5460     uvm_processor_mask_t atomic_mappings;
5461     uvm_processor_mask_t write_mappings;
5462     uvm_processor_mask_t read_mappings;
5463     uvm_processor_mask_t lite_read_mappings;
5464     uvm_processor_mask_t lite_atomic_mappings;
5465     uvm_processor_mask_t remaining_mappings;
5466     uvm_processor_mask_t temp_mappings;
5467     uvm_processor_mask_t resident_processors;
5468     uvm_processor_mask_t native_atomics;
5469     uvm_processor_mask_t non_native_atomics;
5470     uvm_processor_mask_t residency_accessible_from;
5471     uvm_processor_mask_t residency_has_native_atomics;
5472 } mapping_masks_t;
5473 
5474 // Sanity checks for page mappings
5475 static bool block_check_mappings_page(uvm_va_block_t *block,
5476                                       uvm_va_block_context_t *block_context,
5477                                       uvm_page_index_t page_index)
5478 {
5479     uvm_processor_mask_t *atomic_mappings, *write_mappings, *read_mappings;
5480     uvm_processor_mask_t *lite_read_mappings, *lite_atomic_mappings;
5481     uvm_processor_mask_t *remaining_mappings, *temp_mappings;
5482     uvm_processor_mask_t *resident_processors;
5483     uvm_processor_mask_t *native_atomics, *non_native_atomics;
5484     uvm_processor_mask_t *residency_accessible_from;
5485     uvm_processor_mask_t *residency_has_native_atomics;
5486     uvm_processor_id_t residency, id;
5487     uvm_va_range_t *va_range = block->va_range;
5488     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5489     uvm_processor_id_t preferred_location = va_range ?
5490                                             uvm_va_range_get_policy(va_range)->preferred_location :
5491                                             UVM_ID_INVALID;
5492     const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
5493     mapping_masks_t *mapping_masks = uvm_kvmalloc(sizeof(*mapping_masks));
5494 
5495     // Since all subsequent checks are skipped if mapping_masks allocation
5496     // fails, assert so that assertion messages can be seen on non-release
5497     // builds.
5498     UVM_ASSERT(mapping_masks);
5499 
5500     if (!mapping_masks)
5501         return true;
5502 
5503     atomic_mappings = &mapping_masks->atomic_mappings;
5504     write_mappings = &mapping_masks->write_mappings;
5505     read_mappings = &mapping_masks->read_mappings;
5506 
5507     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, atomic_mappings);
5508     block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, write_mappings);
5509     block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, read_mappings);
5510 
5511     // Each access bit implies all accesses below it
5512     UVM_ASSERT(uvm_processor_mask_subset(atomic_mappings, write_mappings));
5513     UVM_ASSERT(uvm_processor_mask_subset(write_mappings, read_mappings));
5514     UVM_ASSERT(uvm_processor_mask_subset(read_mappings, &block->mapped));
5515 
5516     resident_processors = &mapping_masks->resident_processors;
5517 
5518     uvm_va_block_page_resident_processors(block, page_index, resident_processors);
5519     UVM_ASSERT(uvm_processor_mask_subset(resident_processors, &block->resident));
5520 
5521     remaining_mappings = &mapping_masks->remaining_mappings;
5522     temp_mappings = &mapping_masks->temp_mappings;
5523 
5524     // Sanity check block_get_mapped_processors
5525     uvm_processor_mask_copy(remaining_mappings, read_mappings);
5526     for_each_id_in_mask(residency, resident_processors) {
5527         block_get_mapped_processors(block, block_context, residency, page_index, temp_mappings);
5528         UVM_ASSERT(uvm_processor_mask_subset(temp_mappings, remaining_mappings));
5529         uvm_processor_mask_andnot(remaining_mappings, remaining_mappings, temp_mappings);
5530     }
5531 
5532     // Any remaining mappings point to non-resident locations, so they must be
5533     // UVM-Lite mappings.
5534     UVM_ASSERT(uvm_processor_mask_subset(remaining_mappings, uvm_lite_gpus));
5535 
5536     residency = uvm_processor_mask_find_first_id(resident_processors);
5537 
5538     residency_accessible_from = &mapping_masks->residency_accessible_from;
5539     residency_has_native_atomics = &mapping_masks->residency_has_native_atomics;
5540 
5541     if (uvm_processor_mask_get_count(resident_processors) > 0) {
5542         residency_accessible_from    = &va_space->accessible_from[uvm_id_value(residency)];
5543         residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)];
5544     }
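    // If no processor is resident, residency does not name a valid processor
    // and the two pointers above still refer to the scratch masks inside
    // mapping_masks. That is benign: the assert below requires an unresident
    // page to have no mappings, so those masks end up unused in that case.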
5545 
5546     // If the page is not resident, there should be no valid mappings
5547     UVM_ASSERT_MSG(uvm_processor_mask_get_count(resident_processors) > 0 ||
5548                    uvm_processor_mask_get_count(read_mappings) == 0,
5549                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5550                    *resident_processors->bitmap,
5551                    *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
5552                    *va_space->system_wide_atomics_enabled_processors.bitmap,
5553                    *block->read_duplicated_pages.bitmap);
5554 
5555     // Test read_duplicated_pages mask
5556     UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
5557                     uvm_processor_mask_get_count(resident_processors) <= 1) ||
5558                    (uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
5559                     uvm_processor_mask_get_count(resident_processors) >= 1),
5560                    "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5561                    *resident_processors->bitmap,
5562                    *read_mappings->bitmap,
5563                    *write_mappings->bitmap,
5564                    *atomic_mappings->bitmap,
5565                    *va_space->system_wide_atomics_enabled_processors.bitmap,
5566                    *block->read_duplicated_pages.bitmap);
5567 
5568     if (!uvm_processor_mask_empty(uvm_lite_gpus))
5569         UVM_ASSERT(UVM_ID_IS_VALID(preferred_location));
5570 
5571     lite_read_mappings = &mapping_masks->lite_read_mappings;
5572     lite_atomic_mappings = &mapping_masks->lite_atomic_mappings;
5573 
5574     // UVM-Lite checks. Since the range group is made non-migratable before the
5575     // actual migrations for that range group happen, we can only make those
5576     // checks which are valid on both migratable and non-migratable range
5577     // groups.
5578     uvm_processor_mask_and(lite_read_mappings, read_mappings, uvm_lite_gpus);
5579     uvm_processor_mask_and(lite_atomic_mappings, atomic_mappings, uvm_lite_gpus);
5580 
5581     // Any mapping from a UVM-Lite GPU must be atomic...
5582     UVM_ASSERT(uvm_processor_mask_equal(lite_read_mappings, lite_atomic_mappings));
5583 
5584     // ... and must have access to preferred_location
5585     if (UVM_ID_IS_VALID(preferred_location)) {
5586         const uvm_processor_mask_t *preferred_location_accessible_from;
5587 
5588         preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)];
5589         UVM_ASSERT(uvm_processor_mask_subset(lite_atomic_mappings, preferred_location_accessible_from));
5590     }
5591 
5592     for_each_id_in_mask(id, lite_atomic_mappings)
5593         UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location));
5594 
5595     // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests
5596     uvm_processor_mask_andnot(read_mappings, read_mappings, uvm_lite_gpus);
5597     uvm_processor_mask_andnot(write_mappings, write_mappings, uvm_lite_gpus);
5598     uvm_processor_mask_andnot(atomic_mappings, atomic_mappings, uvm_lite_gpus);
5599 
5600     // Pages set to zero in maybe_mapped_pages must not be mapped on any
5601     // non-UVM-Lite GPU
5602     if (!uvm_va_block_is_hmm(block) && !uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
5603         UVM_ASSERT_MSG(uvm_processor_mask_get_count(read_mappings) == 0,
5604                        "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
5605                        *resident_processors->bitmap,
5606                        *block->mapped.bitmap,
5607                        *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap);
5608     }
5609 
5610     // Atomic mappings from GPUs with system-wide atomics disabled are treated
5611     // as write mappings, so remove them from the atomic mappings mask.
5612     uvm_processor_mask_and(atomic_mappings, atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
5613 
5614     if (!uvm_processor_mask_empty(read_mappings)) {
5615         // Read-duplicate: if a page is resident in multiple locations, it
5616         // must be resident locally on each mapped processor.
5617         if (uvm_processor_mask_get_count(resident_processors) > 1) {
5618             UVM_ASSERT_MSG(uvm_processor_mask_subset(read_mappings, resident_processors),
5619                            "Read-duplicate copies from remote processors\n"
5620                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5621                            *resident_processors->bitmap,
5622                            *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
5623                            *va_space->system_wide_atomics_enabled_processors.bitmap,
5624                            *block->read_duplicated_pages.bitmap);
5625         }
5626         else {
5627             // Processors with mappings must have access to the processor that
5628             // has the valid copy
5629             UVM_ASSERT_MSG(uvm_processor_mask_subset(read_mappings, residency_accessible_from),
5630                            "Not all processors have access to %s\n"
5631                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5632                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5633                            uvm_va_space_processor_name(va_space, residency),
5634                            *resident_processors->bitmap,
5635                            *read_mappings->bitmap,
5636                            *write_mappings->bitmap,
5637                            *atomic_mappings->bitmap,
5638                            *residency_accessible_from->bitmap,
5639                            *residency_has_native_atomics->bitmap,
5640                            *va_space->system_wide_atomics_enabled_processors.bitmap);
5641             for_each_id_in_mask(id, read_mappings) {
5642                 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
5643 
5644                 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
5645                     uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
5646                     uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
5647                     uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block,
5648                                                                    block_phys_page(residency, NUMA_NO_NODE, page_index),
5649                                                                    NULL);
5650 
5651                     // This function will assert if no mapping exists
5652                     (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
5653                 }
5654             }
5655         }
5656     }
5657 
5658     // If any processor has a writable mapping, there must only be one copy of
5659     // the page in the system
5660     if (!uvm_processor_mask_empty(write_mappings)) {
5661         UVM_ASSERT_MSG(uvm_processor_mask_get_count(resident_processors) == 1,
5662                        "Too many resident copies for pages with write_mappings\n"
5663                        "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5664                        *resident_processors->bitmap,
5665                        *read_mappings->bitmap,
5666                        *write_mappings->bitmap,
5667                        *atomic_mappings->bitmap,
5668                        *va_space->system_wide_atomics_enabled_processors.bitmap,
5669                        *block->read_duplicated_pages.bitmap);
5670     }
5671 
5672     if (!uvm_processor_mask_empty(atomic_mappings)) {
5673 
5674         native_atomics = &mapping_masks->native_atomics;
5675 
5676         uvm_processor_mask_and(native_atomics, atomic_mappings, residency_has_native_atomics);
5677 
5678         if (uvm_processor_mask_empty(native_atomics)) {
5679             // No other faultable processor should be able to write
5680             uvm_processor_mask_and(write_mappings, write_mappings, &va_space->faultable_processors);
5681 
5682             UVM_ASSERT_MSG(uvm_processor_mask_get_count(write_mappings) == 1,
5683                            "Too many write mappings to %s from processors with non-native atomics\n"
5684                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5685                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5686                            uvm_va_space_processor_name(va_space, residency),
5687                            *resident_processors->bitmap,
5688                            *read_mappings->bitmap,
5689                            *write_mappings->bitmap,
5690                            *atomic_mappings->bitmap,
5691                            *residency_accessible_from->bitmap,
5692                            *residency_has_native_atomics->bitmap,
5693                            *va_space->system_wide_atomics_enabled_processors.bitmap);
5694 
5695             // Only one processor outside of the native group can have atomics enabled
5696             UVM_ASSERT_MSG(uvm_processor_mask_get_count(atomic_mappings) == 1,
5697                            "Too many atomic mappings to %s from processors with non-native atomics\n"
5698                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5699                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5700                            uvm_va_space_processor_name(va_space, residency),
5701                            *resident_processors->bitmap,
5702                            *read_mappings->bitmap,
5703                            *write_mappings->bitmap,
5704                            *atomic_mappings->bitmap,
5705                            *residency_accessible_from->bitmap,
5706                            *residency_has_native_atomics->bitmap,
5707                            *va_space->system_wide_atomics_enabled_processors.bitmap);
5708         }
5709         else {
5710 
5711             non_native_atomics = &mapping_masks->non_native_atomics;
5712 
5713             // One or more processors within the native group have atomics enabled.
5714             // All processors outside of that group may have write but not atomic
5715             // permissions.
5716             uvm_processor_mask_andnot(non_native_atomics, atomic_mappings, residency_has_native_atomics);
5717 
5718             UVM_ASSERT_MSG(uvm_processor_mask_empty(non_native_atomics),
5719                            "Atomic mappings to %s from both native and non-native processors\n"
5720                            "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5721                            "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5722                            uvm_va_space_processor_name(va_space, residency),
5723                            *resident_processors->bitmap,
5724                            *read_mappings->bitmap,
5725                            *write_mappings->bitmap,
5726                            *atomic_mappings->bitmap,
5727                            *residency_accessible_from->bitmap,
5728                            *residency_has_native_atomics->bitmap,
5729                            *va_space->system_wide_atomics_enabled_processors.bitmap);
5730         }
5731     }
5732 
5733     uvm_kvfree(mapping_masks);
5734     return true;
5735 }
5736 
5737 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
5738 {
5739     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5740     uvm_va_block_gpu_state_t *resident_gpu_state;
5741     uvm_pte_bits_gpu_t pte_bit;
5742     uvm_processor_id_t resident_id;
5743     uvm_prot_t prot;
5744     NvU32 big_page_size;
5745     size_t num_big_pages, big_page_index;
5746     uvm_va_block_region_t big_region, chunk_region;
5747     uvm_gpu_chunk_t *chunk;
5748 
5749     if (!gpu_state->page_table_range_4k.table)
5750         UVM_ASSERT(!gpu_state->activated_4k);
5751 
5752     if (!gpu_state->page_table_range_big.table) {
5753         UVM_ASSERT(!gpu_state->initialized_big);
5754         UVM_ASSERT(!gpu_state->activated_big);
5755     }
5756 
5757     // It's only safe to check the PTE mappings if we have page tables. See
5758     // uvm_va_block_get_gpu_va_space.
5759     if (!block_gpu_has_page_tables(block, gpu)) {
5760         UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id));
5761         return true;
5762     }
5763 
5764     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
5765     num_big_pages = uvm_va_block_num_big_pages(block, big_page_size);
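    // Note that num_big_pages only counts big pages that are wholly contained
    // in, and aligned within, this block, so a block that is smaller than or
    // misaligned with respect to big_page_size can have zero big pages.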
5766 
5767     if (block_gpu_supports_2m(block, gpu)) {
5768         if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) {
5769             // 2M blocks require the 2M entry to be allocated for the lower
5770             // ranges to also be allocated.
5771             UVM_ASSERT(gpu_state->page_table_range_2m.table);
5772         }
5773         else if (gpu_state->page_table_range_2m.table) {
5774             // If the 2M entry is present but the lower ones aren't, the PTE
5775             // must be 2M.
5776             UVM_ASSERT(gpu_state->pte_is_2m);
5777         }
5778     }
5779     else {
5780         UVM_ASSERT(!gpu_state->page_table_range_2m.table);
5781         if (num_big_pages == 0)
5782             UVM_ASSERT(!gpu_state->page_table_range_big.table);
5783     }
5784 
5785     // If we have the big table and it's in use then it must have been
5786     // initialized, even if it doesn't currently contain active PTEs.
5787     if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) ||
5788         (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big))
5789         UVM_ASSERT(gpu_state->initialized_big);
5790 
5791     if (gpu_state->pte_is_2m) {
5792         UVM_ASSERT(block_gpu_supports_2m(block, gpu));
5793         UVM_ASSERT(gpu_state->page_table_range_2m.table);
5794         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5795         UVM_ASSERT(!gpu_state->force_4k_ptes);
5796 
5797         // GPU architectures which support 2M pages only support 64K as the big
5798         // page size. All of the 2M code assumes that
5799         // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full,
5800         // bitmap_complement, etc).
5801         BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
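        // For example, with a 64K big page size this requires
        // MAX_BIG_PAGES_PER_UVM_VA_BLOCK == 2M / 64K == 32.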
5802 
5803         prot = block_page_prot_gpu(block, gpu, 0);
5804 
5805         // All page permissions match
5806         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
5807             if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
5808                 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit]));
5809             else
5810                 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit]));
5811         }
5812 
5813         if (prot != UVM_PROT_NONE) {
5814             resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
5815 
5816             // block_check_resident_proximity verifies that no closer processor
5817             // has a resident page, so we don't need to check that all pages
5818             // have the same resident_id.
5819 
5820             // block_check_mappings_page verifies that all pages marked resident
5821             // are backed by populated memory.
5822 
5823             // The mapped processor should be fully resident and physically-
5824             // contiguous.
5825             UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE)));
5826 
5827             if (UVM_ID_IS_GPU(resident_id)) {
5828                 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
5829                 UVM_ASSERT(resident_gpu_state);
5830                 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
5831             }
5832             else {
5833                 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, 0);
5834                 int chunk_nid = uvm_cpu_chunk_get_numa_node(chunk);
5835 
5836                 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
5837                 UVM_ASSERT(chunk);
5838                 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
5839                 UVM_ASSERT(uvm_va_block_cpu_is_region_resident_on(block,
5840                                                                   chunk_nid,
5841                                                                   uvm_va_block_region_from_block(block)));
5842             }
5843         }
5844     }
5845     else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5846         UVM_ASSERT(gpu_state->page_table_range_big.table);
5847         UVM_ASSERT(!gpu_state->force_4k_ptes);
5848         UVM_ASSERT(num_big_pages > 0);
5849         UVM_ASSERT(gpu_state->initialized_big);
5850 
5851         for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) {
5852             big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5853 
5854             if (!test_bit(big_page_index, gpu_state->big_ptes)) {
5855                 // If there are valid mappings but this isn't a big PTE, the
5856                 // mapping must be using the 4k PTEs.
5857                 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region))
5858                     UVM_ASSERT(gpu_state->page_table_range_4k.table);
5859                 continue;
5860             }
5861 
5862             prot = block_page_prot_gpu(block, gpu, big_region.first);
5863 
5864             // All page permissions match
5865             for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
5866                 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
5867                     UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region));
5868                 else
5869                     UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region));
5870             }
5871 
5872             if (prot != UVM_PROT_NONE) {
5873                 resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
5874 
5875                 // The mapped processor should be fully resident and physically-
5876                 // contiguous. Exception: UVM-Lite GPUs always map the preferred
5877                 // location even if the memory is resident elsewhere. Skip the
5878                 // residency check but still verify contiguity.
5879                 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
5880                     UVM_ASSERT(
5881                         uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE),
5882                                                   big_region));
5883                 }
5884 
5885                 if (UVM_ID_IS_CPU(resident_id)) {
5886                     int resident_nid = block_get_page_node_residency(block, big_region.first);
5887                     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, resident_nid);
5888                     uvm_cpu_chunk_t *chunk;
5889 
5890                     UVM_ASSERT(resident_nid != NUMA_NO_NODE);
5891                     UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, big_region));
5892                     chunk = uvm_cpu_chunk_get_chunk_for_page(block, resident_nid, big_region.first);
5893                     UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
5894                     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
5895                     UVM_ASSERT(uvm_page_mask_region_full(&node_state->resident, big_region));
5896                 }
5897                 else {
5898                     // Check GPU chunks
5899                     chunk = block_phys_page_chunk(block,
5900                                                   block_phys_page(resident_id, NUMA_NO_NODE, big_region.first),
5901                                                   NULL);
5902                     chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
5903                     UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
5904                 }
5905             }
5906         }
5907     }
5908 
5909     return true;
5910 }
5911 
5912 static bool block_check_mappings(uvm_va_block_t *block, uvm_va_block_context_t *block_context)
5913 {
5914     uvm_page_index_t page_index;
5915     uvm_processor_id_t id;
5916 
5917     // Verify the master masks, since block_check_mappings_page relies on them
5918     for_each_id(id) {
5919         const uvm_page_mask_t *resident_mask, *map_mask;
5920 
5921         if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) {
5922             UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
5923             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
5924             UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id));
5925             continue;
5926         }
5927 
5928         resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
5929         UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
5930 
5931         map_mask = uvm_va_block_map_mask_get(block, id);
5932         UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask));
5933 
5934         if (UVM_ID_IS_GPU(id)) {
5935             const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id);
5936             UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask));
5937 
5938             // Pages cannot be resident if they are marked as evicted
5939             UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask));
5940 
5941             // Pages cannot be resident on a GPU with no memory
5942             if (!block_processor_has_memory(block, id))
5943                 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
5944         }
5945     }
5946 
5947     // Check that every page has coherent mappings
5948     for_each_va_block_page(page_index, block)
5949         block_check_mappings_page(block, block_context, page_index);
5950 
5951     for_each_gpu_id(id) {
5952         if (uvm_va_block_gpu_state_get(block, id)) {
5953             uvm_gpu_t *gpu = block_get_gpu(block, id);
5954 
5955             // Check big and/or 2M PTE state
5956             block_check_mappings_ptes(block, block_context, gpu);
5957         }
5958     }
5959 
5960     return true;
5961 }
5962 
5963 // See the comments on uvm_va_block_unmap
5964 static void block_unmap_cpu(uvm_va_block_t *block,
5965                             uvm_va_block_context_t *block_context,
5966                             uvm_va_block_region_t region,
5967                             const uvm_page_mask_t *unmap_pages)
5968 {
5969     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5970     uvm_pte_bits_cpu_t pte_bit;
5971     bool unmapped_something = false;
5972     uvm_va_block_region_t subregion;
5973     NvU32 num_mapped_processors;
5974 
5975     // Early-out if nothing in the region is mapped or being unmapped.
5976     if (!block_has_valid_mapping_cpu(block, region) ||
5977         (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])))
5978         return;
5979 
5980     // We can't actually unmap HMM ranges from the CPU here.
5981     // Unmapping happens as part of migrate_vma_setup().
5982     if (uvm_va_block_is_hmm(block)) {
5983         UVM_ASSERT(!uvm_va_block_is_hmm(block));
5984         return;
5985     }
5986 
5987     num_mapped_processors = uvm_processor_mask_get_count(&block->mapped);
5988 
5989     // If we are unmapping a page which we are tracking due to CPU faults with
5990     // correct permissions, clear the info. This will cover both the unmap and
5991     // revoke cases (since we implement CPU revocation by unmap + map)
5992     if (block->cpu.fault_authorized.first_fault_stamp &&
5993         uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index))
5994         block->cpu.fault_authorized.first_fault_stamp = 0;
5995 
5996     for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) {
5997         if (!block_has_valid_mapping_cpu(block, subregion))
5998             continue;
5999 
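        // Zap the CPU PTEs covering this subregion from every process mapping
        // it through the UVM file. The final argument (even_cows == 1) removes
        // copy-on-write mappings as well.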
6000         unmap_mapping_range(va_space->mapping,
6001                             uvm_va_block_region_start(block, subregion),
6002                             uvm_va_block_region_size(subregion), 1);
6003 
6004         for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
6005             uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion);
6006 
6007         // If the CPU is the only processor with mappings we can safely mark
6008         // the pages as fully unmapped
6009         if (num_mapped_processors == 1 && !uvm_va_block_is_hmm(block))
6010             uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
6011 
6012         unmapped_something = true;
6013     }
6014 
6015     if (!unmapped_something)
6016         return;
6017 
6018     // Check whether the block has any more mappings
6019     if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) {
6020         UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
6021         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
6022     }
6023 
6024     UVM_ASSERT(block_check_mappings(block, block_context));
6025 }
6026 
6027 // Given a mask of mapped pages, returns true if any of the pages in the mask
6028 // are mapped remotely by the given GPU.
6029 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
6030                                          uvm_page_mask_t *scratch_page_mask,
6031                                          uvm_gpu_id_t gpu_id,
6032                                          const uvm_page_mask_t *mapped_pages)
6033 {
6034     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
6035 
6036     if (!gpu_state)
6037         return false;
6038 
6039     // The caller must ensure that all pages of the input mask are really mapped
6040     UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
6041 
6042     // UVM-Lite GPUs map the preferred location if it's accessible, regardless
6043     // of the resident location.
6044     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) {
6045         if (uvm_page_mask_empty(mapped_pages))
6046             return false;
6047 
6048         return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
6049     }
6050 
6051     // Remote pages are pages which are mapped but not resident locally
6052     return uvm_page_mask_andnot(scratch_page_mask, mapped_pages, &gpu_state->resident);
6053 }
6054 
6055 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If
6056 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
6057 //
6058 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
6059 // caller is responsible for ending the TLB batch with the appropriate membar.
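//
// Illustrative call sequence (a sketch only; pages_to_clear stands for a
// caller-provided mask): the caller wraps this in a PTE batch and, optionally,
// a TLB batch that it ends with the membar it needs:
//
//     uvm_pte_batch_begin(push, pte_batch);
//     block_gpu_pte_clear_4k(block, gpu, pages_to_clear, 0, pte_batch, tlb_batch);
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);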
6060 static void block_gpu_pte_clear_4k(uvm_va_block_t *block,
6061                                    uvm_gpu_t *gpu,
6062                                    const uvm_page_mask_t *clear_page_mask,
6063                                    NvU64 pte_clear_val,
6064                                    uvm_pte_batch_t *pte_batch,
6065                                    uvm_tlb_batch_t *tlb_batch)
6066 {
6067     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6068     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6069     uvm_gpu_phys_address_t pte_addr;
6070     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
6071     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
6072     uvm_va_block_region_t subregion;
6073     size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
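    // For example, when PAGE_SIZE is 4K each CPU page is covered by a single
    // 4k GPU PTE, while a 64K PAGE_SIZE kernel (seen on some arm64/ppc64le
    // configurations) needs 64K / 4K = 16 GPU PTEs per CPU page.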
6074 
6075     for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) {
6076         num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page;
6077 
6078         pte_addr = uvm_page_table_range_entry_address(tree,
6079                                                       &gpu_state->page_table_range_4k,
6080                                                       subregion.first * ptes_per_page);
6081 
6082         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes);
6083 
6084         if (tlb_batch) {
6085             uvm_tlb_batch_invalidate(tlb_batch,
6086                                      uvm_va_block_region_start(block, subregion),
6087                                      uvm_va_block_region_size(subregion),
6088                                      UVM_PAGE_SIZE_4K,
6089                                      UVM_MEMBAR_NONE);
6090         }
6091     }
6092 }
6093 
6094 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id
6095 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6096 // block_gpu_pte_clear_4k instead.
6097 //
6098 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
6099 //
6100 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
6101 // caller is responsible for ending the TLB batch with the appropriate membar.
6102 static void block_gpu_pte_write_4k(uvm_va_block_t *block,
6103                                    uvm_gpu_t *gpu,
6104                                    uvm_processor_id_t resident_id,
6105                                    uvm_prot_t new_prot,
6106                                    const uvm_page_mask_t *write_page_mask,
6107                                    uvm_pte_batch_t *pte_batch,
6108                                    uvm_tlb_batch_t *tlb_batch)
6109 {
6110     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6111     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6112     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
6113     const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
6114     uvm_va_block_region_t contig_region = {0};
6115     uvm_gpu_phys_address_t contig_addr = {0};
6116     uvm_gpu_phys_address_t page_addr = {0};
6117     uvm_page_index_t page_index;
6118     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6119     int contig_nid = NUMA_NO_NODE;
6120 
6121     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6122     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6123 
6124     for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
6125         uvm_gpu_phys_address_t pte_addr;
6126         size_t i;
6127         int nid = NUMA_NO_NODE;
6128 
6129         if (UVM_ID_IS_CPU(resident_id)) {
6130             nid = block_get_page_node_residency(block, page_index);
6131             UVM_ASSERT(nid != NUMA_NO_NODE);
6132 
6133             // Assume that this mapping will be used to write to the page
6134             if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block))
6135                 block_mark_cpu_page_dirty(block, page_index, nid);
6136         }
6137 
6138         if (page_index >= contig_region.outer || nid != contig_nid) {
6139             contig_region = block_phys_contig_region(block, page_index, resident_id, nid);
6140             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
6141             page_addr = contig_addr;
6142             contig_nid = nid;
6143         }
6144 
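        // The physical address of this page is the base address of its
        // physically-contiguous region plus the page's byte offset within that
        // region.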
6145         page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
6146 
6147         pte_addr = uvm_page_table_range_entry_address(tree,
6148                                                       &gpu_state->page_table_range_4k,
6149                                                       page_index * ptes_per_page);
6150 
6151         // Handle PAGE_SIZE > UVM_PAGE_SIZE_4K: write one 4k GPU PTE for each 4k portion of the CPU page
6152         for (i = 0; i < ptes_per_page; i++) {
6153             NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6154             uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6155             page_addr.address += UVM_PAGE_SIZE_4K;
6156             pte_addr.address += pte_size;
6157         }
6158 
6159         if (tlb_batch) {
6160             NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index);
6161             uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE);
6162         }
6163     }
6164 }
6165 
6166 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered.
6167 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It
6168 // only writes 4k PTEs, not big PTEs.
6169 //
6170 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the
6171 // mapping from the corresponding big page (0) and which ones should be written
6172 // using memory from resident_id and new_prot (1). Unlike the other pte_write
6173 // functions, new_prot may be UVM_PROT_NONE.
6174 //
6175 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID
6176 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE
6177 // in this case.
6178 //
6179 // new_pages_mask must not be NULL.
6180 //
6181 // No TLB invalidates are required since we've set up the lower PTEs to never be
6182 // cached by the GPU's MMU when covered by larger PTEs.
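//
// For example (illustrative): when splitting a valid big PTE in order to map a
// single 4k page with a higher protection, new_pages_mask contains just that
// page and it is written using {resident_id, new_prot}, while every other 4k
// PTE under that big page inherits the big PTE's current residency and
// protection.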
6183 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
6184                                              uvm_va_block_context_t *block_context,
6185                                              uvm_gpu_t *gpu,
6186                                              uvm_processor_id_t resident_id,
6187                                              uvm_prot_t new_prot,
6188                                              const unsigned long *big_ptes_covered,
6189                                              const uvm_page_mask_t *new_pages_mask,
6190                                              uvm_pte_batch_t *pte_batch)
6191 {
6192     uvm_va_block_region_t big_region;
6193     size_t big_page_index;
6194     uvm_processor_id_t curr_resident_id;
6195     uvm_prot_t curr_prot;
6196     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
6197 
6198     if (UVM_ID_IS_INVALID(resident_id))
6199         UVM_ASSERT(new_prot == UVM_PROT_NONE);
6200 
6201     for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6202         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6203 
6204         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
6205 
6206         // The unmap path doesn't know the current residency ahead of time, so
6207         // we have to look it up.
6208         if (UVM_ID_IS_INVALID(resident_id)) {
6209             curr_resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
6210         }
6211         else {
6212             // Check that we aren't changing the aperture of the existing
6213             // mappings. It could be legal in some cases (switching from {RO, A}
6214             // to {RO, B} for example) but we'd need to issue TLB membars.
6215             if (curr_prot != UVM_PROT_NONE) {
6216                 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block,
6217                                                                        block_context,
6218                                                                        gpu,
6219                                                                        big_region.first),
6220                                         resident_id));
6221             }
6222 
6223             curr_resident_id = resident_id;
6224         }
6225 
6226         // pages in new_pages_mask under this big page get new_prot
6227         uvm_page_mask_zero(&block_context->scratch_page_mask);
6228         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
6229         if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
6230             if (new_prot == UVM_PROT_NONE) {
6231                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
6232             }
6233             else {
6234                 block_gpu_pte_write_4k(block,
6235                                        gpu,
6236                                        curr_resident_id,
6237                                        new_prot,
6238                                        &block_context->scratch_page_mask,
6239                                        pte_batch,
6240                                        NULL);
6241             }
6242         }
6243 
6244         // All other pages under this big page inherit curr_prot
6245         uvm_page_mask_zero(&block_context->scratch_page_mask);
6246         uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
6247         if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
6248             if (curr_prot == UVM_PROT_NONE) {
6249                 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
6250             }
6251             else {
6252                 block_gpu_pte_write_4k(block,
6253                                        gpu,
6254                                        curr_resident_id,
6255                                        curr_prot,
6256                                        &block_context->scratch_page_mask,
6257                                        pte_batch,
6258                                        NULL);
6259             }
6260         }
6261     }
6262 }
6263 
6264 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is
6265 // NULL, all big PTEs in the {block, gpu} are cleared.
6266 //
6267 // If tlb_batch is provided, the big PTEs written are added to the batch. The
6268 // caller is responsible for ending the TLB batch with the appropriate membar.
6269 static void block_gpu_pte_clear_big(uvm_va_block_t *block,
6270                                     uvm_gpu_t *gpu,
6271                                     const unsigned long *big_ptes_mask,
6272                                     NvU64 pte_clear_val,
6273                                     uvm_pte_batch_t *pte_batch,
6274                                     uvm_tlb_batch_t *tlb_batch)
6275 {
6276     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6277     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6278     NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
6279     uvm_gpu_phys_address_t pte_addr;
6280     NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
6281     size_t big_page_index;
6282     DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6283 
6284     if (big_ptes_mask)
6285         bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6286     else
6287         bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size));
6288 
6289     for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6290         pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables,
6291                                                       &gpu_state->page_table_range_big,
6292                                                       big_page_index);
6293         uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1);
6294 
6295         if (tlb_batch) {
6296             uvm_tlb_batch_invalidate(tlb_batch,
6297                                      uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
6298                                      big_page_size,
6299                                      big_page_size,
6300                                      UVM_MEMBAR_NONE);
6301         }
6302     }
6303 }
6304 
6305 // Writes the big PTEs in big_ptes_mask using memory from resident_id with
6306 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6307 // block_gpu_pte_clear_big instead.
6308 //
6309 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL.
6310 //
6311 // If tlb_batch is provided, the big PTEs written are added to the batch. The
6312 // caller is responsible for ending the TLB batch with the appropriate membar.
6313 static void block_gpu_pte_write_big(uvm_va_block_t *block,
6314                                     uvm_gpu_t *gpu,
6315                                     uvm_processor_id_t resident_id,
6316                                     uvm_prot_t new_prot,
6317                                     const unsigned long *big_ptes_mask,
6318                                     uvm_pte_batch_t *pte_batch,
6319                                     uvm_tlb_batch_t *tlb_batch)
6320 {
6321     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6322     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6323     uvm_page_tree_t *tree = &gpu_va_space->page_tables;
6324     NvU32 big_page_size = tree->big_page_size;
6325     NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
6326     size_t big_page_index;
6327     uvm_va_block_region_t contig_region = {0};
6328     uvm_gpu_phys_address_t contig_addr = {0};
6329     uvm_gpu_phys_address_t page_addr = {0};
6330     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6331     int contig_nid = NUMA_NO_NODE;
6332 
6333     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6334     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6335     UVM_ASSERT(big_ptes_mask);
6336 
6337     if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
6338         UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0);
6339 
6340         if (!gpu->parent->can_map_sysmem_with_large_pages)
6341             UVM_ASSERT(UVM_ID_IS_GPU(resident_id));
6342     }
6343 
6344     for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6345         NvU64 pte_val;
6346         uvm_gpu_phys_address_t pte_addr;
6347         uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6348         int nid = NUMA_NO_NODE;
6349 
6350         if (UVM_ID_IS_CPU(resident_id)) {
6351             nid = block_get_page_node_residency(block, big_region.first);
6352             UVM_ASSERT(nid != NUMA_NO_NODE);
6353 
6354             // Assume that this mapping will be used to write to the page
6355             if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block)) {
6356                 uvm_page_index_t page_index;
6357 
6358                 for_each_va_block_page_in_region(page_index, big_region)
6359                     block_mark_cpu_page_dirty(block, page_index, nid);
6360             }
6361         }
6362 
6363         if (big_region.first >= contig_region.outer || nid != contig_nid) {
6364             contig_region = block_phys_contig_region(block, big_region.first, resident_id, nid);
6365             contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
6366             page_addr = contig_addr;
6367             contig_nid = nid;
6368         }
6369 
6370         page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
6371 
6372         pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index);
6373         pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6374         uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6375 
6376         if (tlb_batch) {
6377             uvm_tlb_batch_invalidate(tlb_batch,
6378                                      uvm_va_block_region_start(block, big_region),
6379                                      big_page_size,
6380                                      big_page_size,
6381                                      UVM_MEMBAR_NONE);
6382         }
6383     }
6384 }
6385 
6386 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in
6387 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and
6388 // tlb_batch in order to poison the now-unused 4k PTEs.
6389 //
6390 // The 4k PTEs are invalidated with the specified membar.
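//
// This is used, for example, when an unmap leaves an entire big-page region
// with no active mappings: rather than keeping many invalid 4k PTEs live, the
// region is collapsed back into a single unmapped big PTE.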
6391 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
6392                                             uvm_va_block_context_t *block_context,
6393                                             uvm_gpu_t *gpu,
6394                                             const unsigned long *big_ptes_to_merge,
6395                                             uvm_push_t *push,
6396                                             uvm_pte_batch_t *pte_batch,
6397                                             uvm_tlb_batch_t *tlb_batch,
6398                                             uvm_membar_t tlb_membar)
6399 {
6400     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6401     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6402     NvU32 big_page_size = tree->big_page_size;
6403     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
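    // Note that unmapped_pte() is a non-zero "unmapped" pattern rather than an
    // all-zero (invalid) big PTE: an invalid big PTE would leave the 4k PTEs
    // beneath it active, while the unmapped pattern stops the lookup at the
    // big PTE. See the merge comments below.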
6404     size_t big_page_index;
6405     DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6406 
6407     UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6408     UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6409 
6410     // We can be called with the 4k PTEs in two cases:
6411     // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active.
6412     //
6413     // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs
6414     //    active under the big PTE, depending on whether neighboring blocks
6415     //    caused the page tables to be allocated.
6416     //
6417     // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has
6418     // them cached.
6419 
6420     // Each big PTE is currently invalid so the 4ks are active (or unallocated).
6421     // First make the big PTEs unmapped to disable future lookups of the 4ks
6422     // under it. We can't directly transition the entry from valid 4k PTEs to
6423     // valid big PTEs, because that could cause the GPU TLBs to cache the same
6424     // VA in different cache lines. That could cause memory ordering to not be
6425     // maintained.
6426     block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch);
6427 
6428     // Now invalidate the big PTEs we just wrote as well as all 4ks under them.
6429     // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only
6430     // need to invalidate the 4k PTEs without actually writing them.
6431     for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6432         uvm_tlb_batch_invalidate(tlb_batch,
6433                                  uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
6434                                  big_page_size,
6435                                  big_page_size | UVM_PAGE_SIZE_4K,
6436                                  UVM_MEMBAR_NONE);
6437     }
6438 
6439     // End the batches for the caller. We need to do this here in order to
6440     // poison the 4ks below.
6441     uvm_pte_batch_end(pte_batch);
6442     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6443 
6444     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
6445     // PTEs with a pattern which will trigger fatal faults on access. We have to
6446     // do this after the TLB invalidate of the big PTEs, or the GPU might use
6447     // the new values.
6448     if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) {
6449         uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge);
6450         uvm_pte_batch_begin(push, pte_batch);
6451         block_gpu_pte_clear_4k(block,
6452                                gpu,
6453                                &block_context->scratch_page_mask,
6454                                tree->hal->poisoned_pte(),
6455                                pte_batch,
6456                                NULL);
6457         uvm_pte_batch_end(pte_batch);
6458     }
6459 }
6460 
6461 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}.
6462 //
6463 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
6464 // responsible for ending the TLB batch with the appropriate membar.
6465 static void block_gpu_pte_clear_2m(uvm_va_block_t *block,
6466                                    uvm_gpu_t *gpu,
6467                                    uvm_pte_batch_t *pte_batch,
6468                                    uvm_tlb_batch_t *tlb_batch)
6469 {
6470     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6471     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6472     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
6473     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
6474 
6475     // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE,
6476     // which would cause a problem when trying to make the entry invalid since
6477     // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire
6478     // 16 bytes.
6479     uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1);
6480 
6481     if (tlb_batch)
6482         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6483 }
6484 
6485 // Writes the 2M PTE for {block, gpu} using memory from resident_id with
6486 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6487 // block_gpu_pte_clear_2m instead.
6488 //
6489 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
6490 // responsible for ending the TLB batch with the appropriate membar.
6491 static void block_gpu_pte_write_2m(uvm_va_block_t *block,
6492                                    uvm_gpu_t *gpu,
6493                                    uvm_processor_id_t resident_id,
6494                                    uvm_prot_t new_prot,
6495                                    uvm_pte_batch_t *pte_batch,
6496                                    uvm_tlb_batch_t *tlb_batch)
6497 {
6498     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6499     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6500     uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
6501     uvm_gpu_phys_address_t page_addr;
6502     NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
6503     NvU64 pte_val;
6504     NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6505     int nid = NUMA_NO_NODE;
6506 
6507     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6508     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6509 
6510     if (UVM_ID_IS_CPU(resident_id)) {
6511         nid = block_get_page_node_residency(block, 0);
6512         UVM_ASSERT(nid != NUMA_NO_NODE);
6513         if (!uvm_va_block_is_hmm(block))
6514             block_mark_cpu_page_dirty(block, 0, nid);
6515     }
6516 
6517     page_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, 0), gpu);
6518     pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6519     uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6520 
6521     if (tlb_batch)
6522         uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6523 }
6524 
6525 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu)
6526 {
6527     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6528 
6529     if (!block_gpu_supports_2m(block, gpu))
6530         return false;
6531 
6532     if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) ||
6533         (gpu_state->page_table_range_4k.table  && !gpu_state->activated_4k))
6534         return true;
6535 
6536     return false;
6537 }
6538 
6539 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or
6540 // activates a newly-allocated page table (big or 4k) while the other is already
6541 // active. The caller must have already written the new PTEs under the table
6542 // with the appropriate membar.
6543 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch)
6544 {
6545     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6546     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6547 
6548     if (!gpu_state->pte_is_2m)
6549         UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu));
6550 
6551     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
6552 
6553     // We always need a membar to order PDE/PTE writes with the TLB invalidate.
6554     // write_pde will do a MEMBAR_SYS by default.
6555     if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID)
6556         uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
6557     uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push);
6558 
6559     gpu->parent->host_hal->wait_for_idle(push);
6560 
6561     // Invalidate just the PDE
6562     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6563 
6564     if (gpu_state->page_table_range_big.table)
6565         gpu_state->activated_big = true;
6566 
6567     if (gpu_state->page_table_range_4k.table)
6568         gpu_state->activated_4k = true;
6569 }
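
// Added note on ordering in block_gpu_write_pde() above: the PDE write itself
// carries a membar (GPU-local when the 2M range's page tables are in vidmem,
// otherwise the default sysmembar), wait_for_idle() orders that write ahead of
// the TLB invalidate of the 2M entry, and the invalidate is only added to the
// caller's tlb_batch, so the caller must still end the batch.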
6570 
6571 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should
6572 // have written all lower PTEs as appropriate into the given pte_batch already.
6573 // This function ends the PTE batch, activates the 2M PDE, and does a TLB
6574 // invalidate.
6575 //
6576 // The caller does not need to do any TLB invalidates since none of the lower
6577 // PTEs could be cached.
6578 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block,
6579                                           uvm_gpu_t *gpu,
6580                                           uvm_push_t *push,
6581                                           uvm_pte_batch_t *pte_batch,
6582                                           uvm_tlb_batch_t *tlb_batch,
6583                                           uvm_membar_t tlb_membar)
6584 {
6585     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6586     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6587 
6588     // Step 1: Make the 2M entry invalid. We can't directly transition from a
6589     //         valid 2M PTE to valid lower PTEs, because that could cause the
6590     //         GPU TLBs to cache the same VA in different cache lines. That
6591     //         could cause memory ordering to not be maintained.
6592     //
6593     //         If the 2M PTE is already invalid, no TLB invalidate is needed.
6594 
6595     if (curr_prot == UVM_PROT_NONE) {
6596         // If we aren't downgrading, then we don't need a membar.
6597         UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE);
6598 
6599         // End the batch, which pushes a membar to ensure that the caller's PTE
6600         // writes below 2M are observed before the PDE write we're about to do.
6601         uvm_pte_batch_end(pte_batch);
6602     }
6603     else {
6604         // The 64k and 4k PTEs can't possibly be cached since the 2M entry is
6605         // not yet a PDE, so we just need to invalidate this single 2M entry.
6606         uvm_tlb_batch_begin(tree, tlb_batch);
6607         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
6608 
6609         // Make sure the PTE writes are observed before the TLB invalidate
6610         uvm_pte_batch_end(pte_batch);
6611         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6612     }
6613 
6614     // Step 2: Switch the 2M entry from invalid to a PDE. This activates the
6615     //         smaller PTEs.
6616     uvm_tlb_batch_begin(tree, tlb_batch);
6617     block_gpu_write_pde(block, gpu, push, tlb_batch);
6618     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
6619 }
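
// Added sketch (not driver code): the split protocol the callers below follow.
// All lower-level PTEs are written first, while the 2M entry is still a PTE
// and therefore nothing below it can be cached, and only then is
// block_gpu_pte_finish_split_2m() called:
//
//     uvm_pte_batch_begin(push, pte_batch);
//     // ... write the big and/or 4k PTEs which will live under the 2M entry
//     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch,
//                                   tlb_membar);
//
// See block_gpu_map_split_2m(), block_gpu_split_2m() and
// block_gpu_unmap_split_2m() for the real call sites.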
6620 
6621 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE.
6622 // Any lower PTEs are invalidated with the specified membar.
6623 static void block_gpu_pte_merge_2m(uvm_va_block_t *block,
6624                                    uvm_va_block_context_t *block_context,
6625                                    uvm_gpu_t *gpu,
6626                                    uvm_push_t *push,
6627                                    uvm_membar_t tlb_membar)
6628 {
6629     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6630     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6631     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6632     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6633     NvU32 tlb_inval_sizes;
6634 
6635     UVM_ASSERT(!gpu_state->pte_is_2m);
6636     UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
6637 
6638     // The 2M entry is currently a PDE, so first make it invalid. We can't
6639     // directly transition the entry from a valid PDE to a valid 2M PTE, because
6640     // that could cause the GPU TLBs to cache the same VA in different cache
6641     // lines. That could cause memory ordering to not be maintained.
6642     uvm_pte_batch_begin(push, pte_batch);
6643     block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL);
6644     uvm_pte_batch_end(pte_batch);
6645 
6646     // Now invalidate both the 2M entry we just wrote as well as all lower-level
6647     // entries which could be cached. Subsequent MMU fills will stop at the now-
6648     // invalid 2M entry, so we only need to invalidate the lower PTEs without
6649     // actually writing them.
6650     tlb_inval_sizes = UVM_PAGE_SIZE_2M;
6651     if (gpu_state->page_table_range_big.table)
6652         tlb_inval_sizes |= UVM_PAGE_SIZE_64K;
6653 
6654     // Strictly speaking, we only need to invalidate those 4k ranges which are
6655     // not covered by a big PTE. However, any such invalidate will require
6656     // enough 4k invalidates to force the TLB batching to invalidate everything
6657     // anyway, so just do the simpler thing.
6658     if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6659         tlb_inval_sizes |= UVM_PAGE_SIZE_4K;
6660 
6661     uvm_tlb_batch_begin(tree, tlb_batch);
6662     uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE);
6663     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6664 
6665     // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
6666     // PTEs with a pattern which will trigger fatal faults on access. We have to
6667     // do this after the TLB invalidate of the 2M entry, or the GPU might use
6668     // the new values.
6669     if (UVM_IS_DEBUG()) {
6670         uvm_pte_batch_begin(push, pte_batch);
6671 
6672         if (gpu_state->page_table_range_big.table) {
6673             block_gpu_pte_clear_big(block,
6674                                     gpu,
6675                                     NULL,
6676                                     tree->hal->poisoned_pte(),
6677                                     pte_batch,
6678                                     NULL);
6679         }
6680 
6681         if (gpu_state->page_table_range_4k.table) {
6682             block_gpu_pte_clear_4k(block,
6683                                    gpu,
6684                                    NULL,
6685                                    tree->hal->poisoned_pte(),
6686                                    pte_batch,
6687                                    NULL);
6688         }
6689 
6690         uvm_pte_batch_end(pte_batch);
6691     }
6692 }
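
// Added summary of the merge sequence above:
//
//     1) 2M entry: PDE -> invalid 2M PTE (the PTE batch is ended so the write
//        is observed before the invalidate)
//     2) Single TLB invalidate covering the 2M entry plus any big/4k entries
//        which could still be cached
//     3) Debug builds only: poison the now-unused lower-level PTEs
//
// The caller then either leaves the 2M entry invalid (unmap) or rewrites it
// with the new permissions (see block_gpu_map_to_2m()).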
6693 
6694 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
6695 {
6696     // Permissions upgrades (MAP) don't need membars
6697     if (pte_op == BLOCK_PTE_OP_MAP)
6698         return UVM_MEMBAR_NONE;
6699 
6700     UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6701     UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE);
6702 
6703     return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id));
6704 }
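
// Added note: for BLOCK_PTE_OP_REVOKE the downgrade membar typically resolves
// to a GPU-local membar when the pages are resident on the same GPU whose PTEs
// are being revoked, and to a system membar otherwise; see
// uvm_hal_downgrade_membar_type() for the exact policy.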
6705 
6706 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot
6707 // permissions. If the 2M entry is currently a PDE, it is first merged into a
6708 // PTE.
6709 //
6710 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead.
6711 //
6712 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
6713 // the TLB membar required.
6714 static void block_gpu_map_to_2m(uvm_va_block_t *block,
6715                                 uvm_va_block_context_t *block_context,
6716                                 uvm_gpu_t *gpu,
6717                                 uvm_processor_id_t resident_id,
6718                                 uvm_prot_t new_prot,
6719                                 uvm_push_t *push,
6720                                 block_pte_op_t pte_op)
6721 {
6722     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6723     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6724     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6725     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6726     uvm_membar_t tlb_membar;
6727 
6728     UVM_ASSERT(new_prot != UVM_PROT_NONE);
6729 
6730     // If we have a mix of big and 4k PTEs, we have to first merge them to an
6731     // invalid 2M PTE.
6732     if (!gpu_state->pte_is_2m) {
6733         block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE);
6734 
6735         gpu_state->pte_is_2m = true;
6736         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6737     }
6738 
6739     // Write the new permissions
6740     uvm_pte_batch_begin(push, pte_batch);
6741     uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
6742 
6743     block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch);
6744 
6745     uvm_pte_batch_end(pte_batch);
6746 
6747     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
6748     uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6749 }
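
// Added sketch of the state transition performed by block_gpu_map_to_2m():
//
//     mix of big/4k PTEs (PDE) --merge--> invalid 2M PTE --write--> valid 2M
//                                                                   PTE with
//                                                                   new_prot
//
// The intermediate invalid step exists for the same reason described in
// block_gpu_pte_merge_2m(): the GPU TLBs must never cache the same VA in
// different cache lines.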
6750 
6751 // Combination split + map operation, called when only part of a 2M PTE mapping
6752 // is being changed. This splits an existing valid or invalid 2M PTE into the
6753 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state.
6754 //
6755 // The PTEs covering the pages in pages_to_write are written to the memory on
6756 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
6757 //
6758 // The PTEs covering the pages not set in pages_to_write inherit the mapping of
6759 // the current 2M PTE. If the current mapping is valid, it must target
6760 // resident_id.
6761 //
6762 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
6763 // the TLB membar required.
6764 static void block_gpu_map_split_2m(uvm_va_block_t *block,
6765                                    uvm_va_block_context_t *block_context,
6766                                    uvm_gpu_t *gpu,
6767                                    uvm_processor_id_t resident_id,
6768                                    const uvm_page_mask_t *pages_to_write,
6769                                    uvm_prot_t new_prot,
6770                                    uvm_push_t *push,
6771                                    block_pte_op_t pte_op)
6772 {
6773     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6774     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6775     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6776     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6777     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6778     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6779     uvm_membar_t tlb_membar;
6780     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6781     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6782     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6783 
6784     UVM_ASSERT(gpu_state->pte_is_2m);
6785 
6786     if (!gpu_state->page_table_range_4k.table)
6787         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6788 
6789     uvm_pte_batch_begin(push, pte_batch);
6790 
6791     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
6792     // from the lower levels. This means we don't need to issue a TLB invalidate
6793     // when writing those levels.
6794 
6795     // Cases to handle:
6796     // 1) Big PTEs which inherit curr_prot
6797     // 2) Big PTEs which get new_prot
6798     // 3) Big PTEs which are split to 4k
6799     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
6800     //    b) 4k PTEs which get new_prot under the split big PTEs
6801 
6802     // Compute the big PTEs which will need to be split to 4k, if any.
6803     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6804 
6805     if (gpu_state->page_table_range_big.table) {
6806         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
6807         // any. These are the big PTEs which are unchanged (uncovered) by the
6808         // operation.
6809         bitmap_andnot(big_ptes_inherit,
6810                       new_pte_state->big_ptes,
6811                       new_pte_state->big_ptes_covered,
6812                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6813 
6814         if (curr_prot == UVM_PROT_NONE) {
6815             block_gpu_pte_clear_big(block,
6816                                     gpu,
6817                                     big_ptes_inherit,
6818                                     tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
6819                                     pte_batch,
6820                                     NULL);
6821         }
6822         else {
6823             block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
6824         }
6825 
6826         // Case 2: Write the new big PTEs
6827         bitmap_and(big_ptes_new_prot,
6828                    new_pte_state->big_ptes,
6829                    new_pte_state->big_ptes_covered,
6830                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6831         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL);
6832 
6833         // Case 3: Write the big PTEs which cover 4k PTEs
6834         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6835 
6836         // We just wrote all possible big PTEs, so mark them as initialized
6837         gpu_state->initialized_big = true;
6838     }
6839     else {
6840         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6841     }
6842 
6843     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
6844     block_gpu_pte_big_split_write_4k(block,
6845                                      block_context,
6846                                      gpu,
6847                                      resident_id,
6848                                      new_prot,
6849                                      big_ptes_split,
6850                                      pages_to_write,
6851                                      pte_batch);
6852 
6853     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
6854     // invalidate for the 2M entry.
6855     tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
6856     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6857 
6858     gpu_state->pte_is_2m = false;
6859     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6860 }
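
// Added worked example for the bitmap math in block_gpu_map_split_2m() above
// (hypothetical values, low 4 bits shown, assuming a 64K big page size):
//
//     new_pte_state->big_ptes         = 0b1100   // regions staying big
//     new_pte_state->big_ptes_covered = 0b0101   // regions touched by the op
//
//     big_ptes_inherit  = big_ptes & ~covered = 0b1000   // case 1
//     big_ptes_new_prot = big_ptes &  covered = 0b0100   // case 2
//     big_ptes_split    = ~big_ptes           = 0b0011   // case 3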
6861 
6862 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed.
6863 //
6864 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should
6865 // be 4k.
6866 static void block_gpu_split_2m(uvm_va_block_t *block,
6867                                uvm_va_block_context_t *block_context,
6868                                uvm_gpu_t *gpu,
6869                                const unsigned long *new_big_ptes,
6870                                uvm_push_t *push)
6871 {
6872     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6873     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6874     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6875     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6876     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6877     DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6878     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6879     NvU64 unmapped_pte_val;
6880     uvm_processor_id_t curr_residency;
6881 
6882     UVM_ASSERT(gpu_state->pte_is_2m);
6883 
6884     if (new_big_ptes)
6885         bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6886     else
6887         bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6888 
6889     if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6890         UVM_ASSERT(gpu_state->page_table_range_big.table);
6891 
6892     // We're splitting from 2M to big only, so we'll be writing all big PTEs
6893     if (gpu_state->page_table_range_big.table)
6894         gpu_state->initialized_big = true;
6895 
6896     // Cases to handle:
6897     // 1) Big PTEs which inherit curr_prot
6898     // 2) Big PTEs which are split to 4k
6899     //    a) 4k PTEs inherit curr_prot under the split big PTEs
6900 
6901     // big_ptes_split will cover the 4k regions
6902     bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6903     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->mapping.big_split_page_mask, big_ptes_split);
6904 
6905     uvm_pte_batch_begin(push, pte_batch);
6906 
6907     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
6908     // from the lower levels. This means we don't need to issue a TLB invalidate
6909     // when writing those levels.
6910 
6911     if (curr_prot == UVM_PROT_NONE) {
6912         unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size);
6913 
6914         // Case 2a: Clear the 4k PTEs under big_ptes_split
6915         block_gpu_pte_clear_4k(block, gpu, &block_context->mapping.big_split_page_mask, 0, pte_batch, NULL);
6916 
6917         // Case 1: Make the remaining big PTEs unmapped
6918         block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL);
6919     }
6920     else {
6921         curr_residency = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
6922 
6923         // Case 2a: Write the new 4k PTEs under big_ptes_split
6924         block_gpu_pte_write_4k(block,
6925                                gpu,
6926                                curr_residency,
6927                                curr_prot,
6928                                &block_context->mapping.big_split_page_mask,
6929                                pte_batch,
6930                                NULL);
6931 
6932         // Case 1: Write the new big PTEs
6933         block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL);
6934     }
6935 
6936     // Case 2: Make big_ptes_split invalid to activate the 4k PTEs
6937     if (gpu_state->page_table_range_big.table)
6938         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6939 
6940     // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
6941     // invalidate for the 2M entry. No membar is necessary since we aren't
6942     // changing permissions.
6943     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE);
6944 
6945     gpu_state->pte_is_2m = false;
6946     bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6947 }
6948 
6949 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are
6950 // changed.
6951 //
6952 // big_ptes_to_split must not be NULL.
6953 static void block_gpu_split_big(uvm_va_block_t *block,
6954                                 uvm_va_block_context_t *block_context,
6955                                 uvm_gpu_t *gpu,
6956                                 const unsigned long *big_ptes_to_split,
6957                                 uvm_push_t *push)
6958 {
6959     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6960     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6961     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6962     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6963     NvU32 big_page_size = tree->big_page_size;
6964     uvm_va_block_region_t big_region;
6965     uvm_processor_id_t resident_id;
6966     size_t big_page_index;
6967     uvm_prot_t curr_prot;
6968     DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6969 
6970     UVM_ASSERT(!gpu_state->pte_is_2m);
6971     UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6972     UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6973 
6974     uvm_pte_batch_begin(push, pte_batch);
6975     uvm_tlb_batch_begin(tree, tlb_batch);
6976 
6977     // Write all 4k PTEs under all big PTEs which are being split. We'll make
6978     // the big PTEs inactive below after flushing these writes. No TLB
6979     // invalidate is needed since the big PTE is active.
6980     bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6981     for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6982         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6983         curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
6984 
6985         uvm_page_mask_zero(&block_context->mapping.big_split_page_mask);
6986         uvm_page_mask_region_fill(&block_context->mapping.big_split_page_mask, big_region);
6987         if (curr_prot == UVM_PROT_NONE) {
6988             block_gpu_pte_clear_4k(block, gpu, &block_context->mapping.big_split_page_mask, 0, pte_batch, NULL);
6989         }
6990         else {
6991             __set_bit(big_page_index, big_ptes_valid);
6992 
6993             resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
6994 
6995             block_gpu_pte_write_4k(block,
6996                                    gpu,
6997                                    resident_id,
6998                                    curr_prot,
6999                                    &block_context->mapping.big_split_page_mask,
7000                                    pte_batch,
7001                                    NULL);
7002         }
7003     }
7004 
7005     // Unmap the big PTEs which are valid and are being split to 4k. We can't
7006     // directly transition from a valid big PTE to valid lower PTEs, because
7007     // that could cause the GPU TLBs to cache the same VA in different cache
7008     // lines. That could cause memory ordering to not be maintained.
7009     block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
7010 
7011     // End the batches. We have to commit the membars and TLB invalidates
7012     // before we finish splitting formerly-big PTEs. No membar is necessary
7013     // since we aren't changing permissions.
7014     uvm_pte_batch_end(pte_batch);
7015     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7016 
7017     // Finish the split by switching the big PTEs from unmapped to invalid. This
7018     // causes the GPU MMU to start reading the 4k PTEs instead of stopping at
7019     // the unmapped big PTEs.
7020     uvm_pte_batch_begin(push, pte_batch);
7021     uvm_tlb_batch_begin(tree, tlb_batch);
7022 
7023     block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch);
7024 
7025     uvm_pte_batch_end(pte_batch);
7026 
7027     // Finally, activate the page tables if they're inactive
7028     if (block_gpu_needs_to_activate_table(block, gpu))
7029         block_gpu_write_pde(block, gpu, push, tlb_batch);
7030 
7031     uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7032 
7033     bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7034 }
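
// Added summary of the big-PTE split sequence in block_gpu_split_big() above:
//
//     1) Write the 4k PTEs under each splitting big PTE (no TLB invalidate
//        needed while the big PTE is still active)
//     2) Big PTE: valid -> unmapped, then end the PTE and TLB batches
//     3) Big PTE: unmapped -> invalid, which finally makes the MMU walk down
//        to the 4k PTEs, then activate the 4k table if needed
//
// Step 2 is what prevents the TLBs from holding both the big and 4k
// translations for the same VA at the same time.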
7035 
7036 // Changes permissions on some pre-existing mix of big and 4k PTEs into some
7037 // other mix of big and 4k PTEs, as described by
7038 // block_context->mapping.new_pte_state.
7039 //
7040 // The PTEs covering the pages in pages_to_write are written to the memory on
7041 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
7042 //
7043 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
7044 // the TLB membar required.
7045 static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
7046                                      uvm_va_block_context_t *block_context,
7047                                      uvm_gpu_t *gpu,
7048                                      uvm_processor_id_t resident_id,
7049                                      const uvm_page_mask_t *pages_to_write,
7050                                      uvm_prot_t new_prot,
7051                                      uvm_push_t *push,
7052                                      block_pte_op_t pte_op)
7053 {
7054     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7055     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7056     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7057     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7058     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7059     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7060     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7061     DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7062     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7063     uvm_va_block_region_t big_region;
7064     size_t big_page_index;
7065     NvU32 big_page_size = tree->big_page_size;
7066     uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
7067 
7068     UVM_ASSERT(!gpu_state->pte_is_2m);
7069 
7070     uvm_pte_batch_begin(push, pte_batch);
7071     uvm_tlb_batch_begin(tree, tlb_batch);
7072 
7073     // All of these cases might be performed in the same call:
7074     // 1) Split currently-big PTEs to 4k
7075     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
7076     //    b) Write new 4k PTEs which get new_prot under the split big PTEs
7077     // 2) Merge currently-4k PTEs to big with new_prot
7078     // 3) Write currently-big PTEs which wholly get new_prot
7079     // 4) Write currently-4k PTEs which get new_prot
7080     // 5) Initialize big PTEs which are not covered by this operation
7081 
7082     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
7083     // being split. We'll make the big PTEs inactive below after flushing these
7084     // writes. No TLB invalidate is needed since the big PTE is active.
7085     //
7086     // Mask computation: big_before && !big_after
7087     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7088 
7089     block_gpu_pte_big_split_write_4k(block,
7090                                      block_context,
7091                                      gpu,
7092                                      resident_id,
7093                                      new_prot,
7094                                      big_ptes_split,
7095                                      pages_to_write,
7096                                      pte_batch);
7097 
7098     // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and
7099     // remain uncovered after the operation.
7100     //
7101     // Mask computation: !big_before && !big_after
7102     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7103     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
7104     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) {
7105         block_gpu_pte_write_4k(block,
7106                                gpu,
7107                                resident_id,
7108                                new_prot,
7109                                &block_context->scratch_page_mask,
7110                                pte_batch,
7111                                tlb_batch);
7112     }
7113 
7114     // Case 5: If the big page table is newly-allocated, make sure that all big
7115     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
7116     // all initialized to invalid.
7117     //
7118     // The similar case of making newly-allocated big PTEs unmapped when no
7119     // lower 4k table is present is handled by having
7120     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
7121     // appropriately.
7122     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
7123         // TODO: Bug 1766424: If we have the 4k page table already, we could
7124         //       attempt to merge all uncovered big PTE regions when first
7125         //       allocating the big table. That's probably not worth doing.
7126         UVM_ASSERT(gpu_state->page_table_range_4k.table);
7127         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7128         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
7129         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
7130         gpu_state->initialized_big = true;
7131     }
7132 
7133     // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are
7134     // being split to 4k. We can't directly transition from a valid big PTE to
7135     // valid lower PTEs, because that could cause the GPU TLBs to cache the same
7136     // VA in different cache lines. That could cause memory ordering to not be
7137     // maintained.
7138     bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7139     for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
7140         big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
7141         if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first))
7142             __set_bit(big_page_index, big_ptes_mask);
7143     }
7144 
7145     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
7146 
7147     // Case 3: Write the currently-big PTEs which remain big PTEs, and are
7148     // wholly changing permissions.
7149     //
7150     // Mask computation: big_before && big_after && covered
7151     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7152     if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
7153         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch);
7154 
7155     // Case 2 (step 1): Merge the new big PTEs and end the batches, now that
7156     // we've done all of the independent PTE writes we can. This also merges
7157     // newly-allocated uncovered big PTEs to unmapped (see
7158     // block_gpu_compute_new_pte_state).
7159     //
7160     // Mask computation: !big_before && big_after
7161     if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7162         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
7163         // batches.
7164         block_gpu_pte_merge_big_and_end(block,
7165                                         block_context,
7166                                         gpu,
7167                                         big_ptes_merge,
7168                                         push,
7169                                         pte_batch,
7170                                         tlb_batch,
7171                                         tlb_membar);
7172 
7173         // Remove uncovered big PTEs. We needed to merge them to unmapped above,
7174         // but they shouldn't get new_prot below.
7175         bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7176     }
7177     else {
7178         // End the batches. We have to commit the membars and TLB invalidates
7179         // before we finish splitting formerly-big PTEs.
7180         uvm_pte_batch_end(pte_batch);
7181         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7182     }
7183 
7184     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7185         !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7186         block_gpu_needs_to_activate_table(block, gpu)) {
7187 
7188         uvm_pte_batch_begin(push, pte_batch);
7189         uvm_tlb_batch_begin(tree, tlb_batch);
7190 
7191         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
7192         // switching them from unmapped to invalid. This causes the GPU MMU to
7193         // start reading the 4k PTEs instead of stopping at the unmapped big
7194         // PTEs.
7195         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
7196 
7197         // Case 2 (step 2): Finish merging our big PTEs, if we have any, by
7198         // switching them from unmapped to new_prot.
7199         block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch);
7200 
7201         uvm_pte_batch_end(pte_batch);
7202 
7203         // Finally, activate the page tables if they're inactive
7204         if (block_gpu_needs_to_activate_table(block, gpu))
7205             block_gpu_write_pde(block, gpu, push, tlb_batch);
7206 
7207         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7208     }
7209 
7210     // Update gpu_state
7211     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7212 }
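
// Added worked example for the mask computations in block_gpu_map_big_and_4k()
// above (hypothetical values, low 4 bits shown):
//
//     big_before = gpu_state->big_ptes             = 0b0110
//     big_after  = new_pte_state->big_ptes         = 0b0011
//     covered    = new_pte_state->big_ptes_covered = 0b1011
//
//     case 1 (split to 4k):    big_before & ~big_after           = 0b0100
//     case 2 (merge to big):  ~big_before &  big_after           = 0b0001
//     case 3 (rewrite big):    big_before &  big_after & covered = 0b0010
//     case 4 (plain 4k):     ~(big_before |  big_after)  [as a page mask]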
7213 
7214 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is
7215 // merged into a PTE.
7216 static void block_gpu_unmap_to_2m(uvm_va_block_t *block,
7217                                   uvm_va_block_context_t *block_context,
7218                                   uvm_gpu_t *gpu,
7219                                   uvm_push_t *push,
7220                                   uvm_membar_t tlb_membar)
7221 {
7222     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7223     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
7224     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7225     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7226 
7227     if (gpu_state->pte_is_2m) {
7228         // If we're already mapped as a valid 2M PTE, just write it to invalid
7229         uvm_pte_batch_begin(push, pte_batch);
7230         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7231 
7232         block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
7233 
7234         uvm_pte_batch_end(pte_batch);
7235         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7236     }
7237     else {
7238         // Otherwise we have a mix of big and 4K PTEs which need to be merged
7239         // into an invalid 2M PTE.
7240         block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar);
7241 
7242         gpu_state->pte_is_2m = true;
7243         bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7244     }
7245 }
7246 
7247 // Combination split + unmap operation, called when only part of a valid 2M PTE
7248 // mapping is being unmapped. The 2M PTE is split into a mix of valid and
7249 // invalid big and/or 4k PTEs, as described by
7250 // block_context->mapping.new_pte_state.
7251 //
7252 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
7253 //
7254 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of
7255 // the current 2M PTE.
7256 static void block_gpu_unmap_split_2m(uvm_va_block_t *block,
7257                                      uvm_va_block_context_t *block_context,
7258                                      uvm_gpu_t *gpu,
7259                                      const uvm_page_mask_t *pages_to_unmap,
7260                                      uvm_push_t *push,
7261                                      uvm_membar_t tlb_membar)
7262 {
7263     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7264     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7265     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7266     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7267     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7268     uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
7269     uvm_processor_id_t resident_id;
7270     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7271     DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7272     DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7273 
7274     UVM_ASSERT(gpu_state->pte_is_2m);
7275 
7276     resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
7277 
7278     uvm_pte_batch_begin(push, pte_batch);
7279 
7280     // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
7281     // from the lower levels. This means we don't need to issue a TLB invalidate
7282     // when writing those levels.
7283 
7284     // Cases to handle:
7285     // 1) Big PTEs which inherit curr_prot
7286     // 2) Big PTEs which get unmapped
7287     // 3) Big PTEs which are split to 4k
7288     //    a) 4k PTEs which inherit curr_prot under the split big PTEs
7289     //    b) 4k PTEs which get unmapped under the split big PTEs
7290 
7291     // Compute the big PTEs which will need to be split to 4k, if any.
7292     bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7293 
7294     if (gpu_state->page_table_range_big.table) {
7295         // Case 1: Write the big PTEs which will inherit the 2M permissions, if
7296         // any. These are the big PTEs which are unchanged (uncovered) by the
7297         // operation.
7298         bitmap_andnot(big_ptes_inherit,
7299                       new_pte_state->big_ptes,
7300                       new_pte_state->big_ptes_covered,
7301                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7302 
7303         block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
7304 
7305         // Case 2: Clear the new big PTEs which get unmapped (those not covering
7306         // 4ks)
7307         bitmap_and(big_ptes_new_prot,
7308                    new_pte_state->big_ptes,
7309                    new_pte_state->big_ptes_covered,
7310                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7311 
7312         block_gpu_pte_clear_big(block,
7313                                 gpu,
7314                                 big_ptes_new_prot,
7315                                 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
7316                                 pte_batch,
7317                                 NULL);
7318 
7319         // Case 3: Write the big PTEs which cover 4k PTEs
7320         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
7321 
7322         // We just wrote all possible big PTEs, so mark them as initialized
7323         gpu_state->initialized_big = true;
7324     }
7325     else {
7326         UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7327         UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7328     }
7329 
7330     // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
7331     block_gpu_pte_big_split_write_4k(block,
7332                                      block_context,
7333                                      gpu,
7334                                      resident_id,
7335                                      UVM_PROT_NONE,
7336                                      big_ptes_split,
7337                                      pages_to_unmap,
7338                                      pte_batch);
7339 
7340     // And activate the 2M PDE. This ends the pte_batch and issues a single TLB
7341     // invalidate for the 2M entry.
7342     block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
7343 
7344     gpu_state->pte_is_2m = false;
7345     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7346 }
7347 
7348 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
7349 // and 4k PTEs.
7350 //
7351 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
7352 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
7353                                        uvm_va_block_context_t *block_context,
7354                                        uvm_gpu_t *gpu,
7355                                        const uvm_page_mask_t *pages_to_unmap,
7356                                        uvm_push_t *push,
7357                                        uvm_membar_t tlb_membar)
7358 {
7359     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7360     uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7361     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7362     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7363     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7364     DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7365     DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7366     DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7367     NvU32 big_page_size = tree->big_page_size;
7368     NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
7369 
7370     UVM_ASSERT(!gpu_state->pte_is_2m);
7371 
7372     uvm_pte_batch_begin(push, pte_batch);
7373     uvm_tlb_batch_begin(tree, tlb_batch);
7374 
7375     // All of these cases might be performed in the same call:
7376     // 1) Split currently-big PTEs to 4k
7377     //    a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
7378     //    b) Clear new 4k PTEs which get unmapped under the split big PTEs
7379     // 2) Merge currently-4k PTEs to unmapped big
7380     // 3) Clear currently-big PTEs which wholly get unmapped
7381     // 4) Clear currently-4k PTEs which get unmapped
7382     // 5) Initialize big PTEs which are not covered by this operation
7383 
7384     // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
7385     // being split. We'll make the big PTEs inactive below after flushing these
7386     // writes. No TLB invalidate is needed since the big PTE is active.
7387     //
7388     // Mask computation: big_before && !big_after
7389     bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7390 
7391     block_gpu_pte_big_split_write_4k(block,
7392                                      block_context,
7393                                      gpu,
7394                                      UVM_ID_INVALID,
7395                                      UVM_PROT_NONE,
7396                                      big_ptes_split,
7397                                      pages_to_unmap,
7398                                      pte_batch);
7399 
7400     // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
7401     // remain uncovered after the unmap.
7402     //
7403     // Mask computation: !big_before && !big_after
7404     bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7405     uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
7406     if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
7407         block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
7408 
7409     // Case 5: If the big page table is newly-allocated, make sure that all big
7410     // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
7411     // all initialized to invalid.
7412     //
7413     // The similar case of making newly-allocated big PTEs unmapped when no
7414     // lower 4k table is present is handled by having
7415     // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
7416     // appropriately.
7417     if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
7418         // TODO: Bug 1766424: If we have the 4k page table already, we could
7419         //       attempt to merge all uncovered big PTE regions when first
7420         //       allocating the big table. That's probably not worth doing.
7421         UVM_ASSERT(gpu_state->page_table_range_4k.table);
7422         UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7423         bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
7424         block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
7425         gpu_state->initialized_big = true;
7426     }
7427 
7428     // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are
7429     // getting wholly unmapped, and those currently-big PTEs which are being
7430     // split to 4k. We can't directly transition from a valid big PTE to valid
7431     // lower PTEs, because that could cause the GPU TLBs to cache the same VA in
7432     // different cache lines. That could cause memory ordering to not be
7433     // maintained.
7434     //
7435     // Mask computation: (big_before && big_after && covered) ||
7436     //                   (big_before && !big_after)
7437     bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7438     bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7439     bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7440     block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch);
7441 
7442     // Case 2: Merge the new big PTEs and end the batches, now that we've done
7443     // all of the independent PTE writes we can.
7444     //
7445     // Mask computation: !big_before && big_after
7446     if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7447         // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
7448         // batches.
7449         block_gpu_pte_merge_big_and_end(block,
7450                                         block_context,
7451                                         gpu,
7452                                         big_ptes_mask,
7453                                         push,
7454                                         pte_batch,
7455                                         tlb_batch,
7456                                         tlb_membar);
7457     }
7458     else {
7459         // End the batches. We have to commit the membars and TLB invalidates
7460         // before we finish splitting formerly-big PTEs.
7461         uvm_pte_batch_end(pte_batch);
7462         uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7463     }
7464 
7465     if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7466         block_gpu_needs_to_activate_table(block, gpu)) {
7467         uvm_pte_batch_begin(push, pte_batch);
7468         uvm_tlb_batch_begin(tree, tlb_batch);
7469 
7470         // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
7471         // switching them from unmapped to invalid. This causes the GPU MMU to
7472         // start reading the 4k PTEs instead of stopping at the unmapped big
7473         // PTEs.
7474         block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
7475 
7476         uvm_pte_batch_end(pte_batch);
7477 
7478         // Finally, activate the page tables if they're inactive
7479         if (block_gpu_needs_to_activate_table(block, gpu))
7480             block_gpu_write_pde(block, gpu, push, tlb_batch);
7481 
7482         uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7483     }
7484 
7485     // Update gpu_state
7486     bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7487 }
7488 
7489 // When PTE state is about to change (for example due to a map/unmap/revoke
7490 // operation), this function decides how to split and merge the PTEs in response
7491 // to that operation.
7492 //
7493 // The operation is described with the two page masks:
7494 //
7495 // - pages_changing indicates which pages will have their PTE mappings changed
7496 //   on the GPU in some way as a result of the operation (for example, which
7497 //   pages will actually have their mapping permissions upgraded).
7498 //
7499 // - page_mask_after indicates which pages on this GPU will have exactly the
7500 //   same PTE attributes (permissions, residency) as pages_changing after the
7501 //   operation is applied.
7502 //
7503 // PTEs are merged eagerly.
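// For example (added, hypothetical): upgrading one 64K-aligned subregion to
// read-write while the rest of the block already has identical attributes
// would have pages_changing cover just that subregion, while page_mask_after
// could be full, which is what allows the result to collapse into a single
// 2M PTE below.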
7504 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
7505                                             uvm_gpu_t *gpu,
7506                                             uvm_processor_id_t resident_id,
7507                                             const uvm_page_mask_t *pages_changing,
7508                                             const uvm_page_mask_t *page_mask_after,
7509                                             uvm_va_block_new_pte_state_t *new_pte_state)
7510 {
7511     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7512     uvm_va_block_region_t big_region_all, big_page_region, region;
7513     NvU32 big_page_size;
7514     uvm_page_index_t page_index;
7515     size_t big_page_index;
7516     DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7517     bool can_make_new_big_ptes;
7518 
7519     memset(new_pte_state, 0, sizeof(*new_pte_state));
7520     new_pte_state->needs_4k = true;
7521 
7522     // TODO: Bug 1676485: Force a specific page size for perf testing
7523 
7524     if (gpu_state->force_4k_ptes)
7525         return;
7526 
7527     // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(),
7528     // hmm_range_fault(), and make_device_exclusive_range() don't handle folios
7529     // yet. Also, it makes mremap() difficult since the new address may not
7530     // align with the GPU block size otherwise.
7531     // If PAGE_SIZE is 64K, the code following this check is OK since 64K
7532     // big pages are supported on all HMM-supported GPUs (Turing+).
7533     // TODO: Bug 3368756: add support for transparent huge pages (THP).
7534     if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K)
7535         return;
7536 
7537     UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after));
7538 
7539     // If all pages in the 2M mask have the same attributes after the
7540     // operation is applied, we can use a 2M PTE.
7541     if (block_gpu_supports_2m(block, gpu) && uvm_page_mask_full(page_mask_after) &&
7542         (UVM_ID_IS_INVALID(resident_id) ||
7543          is_block_phys_contig(block, resident_id, block_get_page_node_residency(block, 0)))) {
7544         new_pte_state->pte_is_2m = true;
7545         new_pte_state->needs_4k = false;
7546         return;
7547     }
7548 
7549     // Find big PTEs with matching attributes
7550 
7551     // Can this block fit any big pages?
7552     big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
7553     big_region_all = uvm_va_block_big_page_region_all(block, big_page_size);
7554     if (big_region_all.first >= big_region_all.outer)
7555         return;
7556 
7557     new_pte_state->needs_4k = false;
7558 
7559     can_make_new_big_ptes = true;
7560 
7561     // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+).
7562     if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages)
7563         can_make_new_big_ptes = false;
7564 
7565     // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID)
7566     // with no splits required. That means we should avoid allocating PTEs
7567     // which are only needed for merges.
7568     //
7569     // This only matters if we're merging to big PTEs. If we're merging to 2M,
7570     // then we must already have the 2M level (since it has to be allocated
7571     // before the lower levels).
7572     //
7573     // If pte_is_2m already and we don't have a big table, we're splitting so we
7574     // have to allocate.
7575     if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m)
7576         can_make_new_big_ptes = false;
7577 
7578     for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
7579         uvm_cpu_chunk_t *chunk = NULL;
7580         int nid = NUMA_NO_NODE;
7581 
7582         if (UVM_ID_IS_CPU(resident_id)) {
7583             nid = block_get_page_node_residency(block, page_index);
7584             UVM_ASSERT(nid != NUMA_NO_NODE);
7585             chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
7586         }
7587 
7588         big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
7589         big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
7590 
7591         __set_bit(big_page_index, new_pte_state->big_ptes_covered);
7592 
7593         // When mapping sysmem, we can use big pages only if we are mapping all
7594         // pages in the big page subregion and the CPU pages backing the
7595         // subregion are physically contiguous.
7596         if (can_make_new_big_ptes &&
7597             uvm_page_mask_region_full(page_mask_after, big_page_region) &&
7598             (!UVM_ID_IS_CPU(resident_id) ||
7599              (uvm_cpu_chunk_get_size(chunk) >= big_page_size &&
7600               uvm_va_block_cpu_is_region_resident_on(block, nid, big_page_region))))
7601             __set_bit(big_page_index, new_pte_state->big_ptes);
7602 
7603         if (!test_bit(big_page_index, new_pte_state->big_ptes))
7604             new_pte_state->needs_4k = true;
7605 
7606         // Skip to the end of the region
7607         page_index = big_page_region.outer - 1;
7608     }
7609 
7610     if (!new_pte_state->needs_4k) {
7611         // All big page regions in pages_changing will be big PTEs. Now check if
7612         // there are any unaligned pages outside of big_region_all which are
7613         // changing.
7614         region = uvm_va_block_region(0, big_region_all.first);
7615         if (!uvm_page_mask_region_empty(pages_changing, region)) {
7616             new_pte_state->needs_4k = true;
7617         }
7618         else {
7619             region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block));
7620             if (!uvm_page_mask_region_empty(pages_changing, region))
7621                 new_pte_state->needs_4k = true;
7622         }
7623     }
7624 
7625     // Now add in the PTEs which should be big but weren't covered by this
7626     // operation.
7627     //
7628     // Note that we can't assume that a given page table range has been
7629     // initialized if it's present here, since it could have been allocated by a
7630     // thread which had to restart its operation due to allocation retry.
7631     if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) {
7632         // We're splitting a 2M PTE so all of the uncovered big PTE regions will
7633         // become big PTEs which inherit the 2M permissions. If we haven't
7634         // allocated the 2M table yet, it will start as a 2M PTE until the lower
7635         // levels are allocated, so it's the same split case regardless of
7636         // whether this operation will need to retry a later allocation.
7637         bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7638     }
7639     else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) {
7640         // If we don't have 4k PTEs and we won't be allocating them for this
7641         // operation, all of our PTEs need to be big.
7642         UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7643         bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7644         bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size));
7645     }
7646     else {
7647         // Otherwise, add in all of the currently-big PTEs which are unchanging.
7648         // They won't be written, but they need to be carried into the new
7649         // gpu_state->big_ptes when it's updated.
7650         bitmap_andnot(big_ptes_not_covered,
7651                       gpu_state->big_ptes,
7652                       new_pte_state->big_ptes_covered,
7653                       MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7654     }
7655 
7656     bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7657 }
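
// Illustrative walk-through of block_gpu_compute_new_pte_state(), assuming a
// fully-aligned 2MB block and a 64K GPU big page size (32 big-page regions);
// the concrete numbers are only an example, since the real region count
// depends on the block's alignment and uvm_va_block_gpu_big_page_size():
//
//   - Suppose pages_changing fully covers big-page regions 0 and 1 but only
//     half of region 2, and page_mask_after is likewise not full over region
//     2. The loop above sets big_ptes_covered = {0, 1, 2} and big_ptes =
//     {0, 1} (provided that, for CPU residency, the backing chunks are large
//     enough and contiguous), and needs_4k ends up true because region 2
//     cannot become a big PTE.
//   - If gpu_state->pte_is_2m, the 2M PTE is being split, so the complement
//     {3..31} of big_ptes_covered is OR'd into big_ptes and those regions
//     inherit the 2M permissions as big PTEs.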
7658 
7659 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that
7660 // handles allocation retry. If the block lock has been unlocked and relocked as
7661 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal
7662 // to the caller that the operation likely needs to be restarted. If that
7663 // happens, the pending tracker is added to the block's tracker.
7664 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
7665                                                  uvm_gpu_t *gpu,
7666                                                  NvU32 page_size,
7667                                                  uvm_page_table_range_t *page_table_range,
7668                                                  uvm_tracker_t *pending_tracker)
7669 {
7670     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7671     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
7672     uvm_page_tree_t *page_tables = &gpu_va_space->page_tables;
7673     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
7674     uvm_page_table_range_t local_range;
7675     NV_STATUS status;
7676 
7677     // Blocks may contain large PTEs without starting on a PTE boundary or
7678     // having an aligned size. Cover the PTEs of this size in the block's
7679     // interior so we match uvm_va_block_gpu_state_t::big_ptes.
7680     NvU64 start = UVM_ALIGN_UP(va_block->start, page_size);
7681     NvU64 size  = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start;
7682 
7683     // VA blocks which can use the 2MB level as either a PTE or a PDE need to
7684     // account for the PDE specially, so they must use uvm_page_tree_alloc_table
7685     // to allocate the lower levels.
7686     bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M;
7687 
7688     UVM_ASSERT(page_table_range->table == NULL);
7689 
7690     if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) {
7691         --va_block_test->page_table_allocation_retry_force_count;
7692         status = NV_ERR_NO_MEMORY;
7693     }
7694     else if (use_alloc_table) {
7695         // Pascal+: 4k/64k tables under a 2M entry
7696         UVM_ASSERT(gpu_state->page_table_range_2m.table);
7697         status = uvm_page_tree_alloc_table(page_tables,
7698                                            page_size,
7699                                            UVM_PMM_ALLOC_FLAGS_NONE,
7700                                            &gpu_state->page_table_range_2m,
7701                                            page_table_range);
7702     }
7703     else {
7704         // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+
7705         status = uvm_page_tree_get_ptes(page_tables,
7706                                         page_size,
7707                                         start,
7708                                         size,
7709                                         UVM_PMM_ALLOC_FLAGS_NONE,
7710                                         page_table_range);
7711     }
7712 
7713     if (status == NV_OK)
7714         goto allocated;
7715 
7716     if (status != NV_ERR_NO_MEMORY)
7717         return status;
7718 
7719     // Before unlocking the block lock, any pending work on the block has to be
7720     // added to the block's tracker.
7721     if (pending_tracker) {
7722         status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker);
7723         if (status != NV_OK)
7724             return status;
7725     }
7726 
7727     // Unlock the va block and retry with eviction enabled
7728     uvm_mutex_unlock(&va_block->lock);
7729 
7730     if (use_alloc_table) {
7731         // Although we don't hold the block lock here, it's safe to pass
7732         // gpu_state->page_table_range_2m to the page tree code because we know
7733         // that the 2m range has already been allocated, and that it can't go
7734         // away while we have the va_space lock held.
7735         status = uvm_page_tree_alloc_table(page_tables,
7736                                            page_size,
7737                                            UVM_PMM_ALLOC_FLAGS_EVICT,
7738                                            &gpu_state->page_table_range_2m,
7739                                            &local_range);
7740     }
7741     else {
7742         status = uvm_page_tree_get_ptes(page_tables,
7743                                         page_size,
7744                                         start,
7745                                         size,
7746                                         UVM_PMM_ALLOC_FLAGS_EVICT,
7747                                         &local_range);
7748     }
7749 
7750     uvm_mutex_lock(&va_block->lock);
7751 
7752     if (status != NV_OK)
7753         return status;
7754 
7755     status = NV_ERR_MORE_PROCESSING_REQUIRED;
7756 
7757     if (page_table_range->table) {
7758         // A different caller allocated the page tables in the meantime, so
7759         // release the local copy.
7760         uvm_page_tree_put_ptes(page_tables, &local_range);
7761         return status;
7762     }
7763 
7764     *page_table_range = local_range;
7765 
7766 allocated:
7767     // Mark the 2M PTE as active when we first allocate it, since we don't have
7768     // any PTEs below it yet.
7769     if (page_size == UVM_PAGE_SIZE_2M) {
7770         UVM_ASSERT(!gpu_state->pte_is_2m);
7771         gpu_state->pte_is_2m = true;
7772     }
7773     else if (page_size != UVM_PAGE_SIZE_4K) {
7774         // uvm_page_tree_get_ptes initializes big PTEs to invalid.
7775         // uvm_page_tree_alloc_table does not, so we'll have to do it later.
7776         if (use_alloc_table)
7777             UVM_ASSERT(!gpu_state->initialized_big);
7778         else
7779             gpu_state->initialized_big = true;
7780     }
7781 
7782     return status;
7783 }
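
// A minimal sketch of how a caller might consume the retry protocol above
// (hypothetical operation_that_allocates_ptes(), not driver code): the block
// lock is held across the call, the helper may drop and re-take it
// internally, and NV_ERR_MORE_PROCESSING_REQUIRED means any state derived
// from the block before the call may be stale and must be recomputed.
//
//     uvm_mutex_lock(&va_block->lock);
//     do {
//         // Recompute any state derived from the block here, then retry.
//         status = operation_that_allocates_ptes(va_block, gpu, ...);
//     } while (status == NV_ERR_MORE_PROCESSING_REQUIRED);
//     uvm_mutex_unlock(&va_block->lock);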
7784 
7785 // Helper which allocates all page table ranges necessary for the given page
7786 // sizes. See block_alloc_pt_range_with_retry.
7787 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
7788                                              uvm_gpu_t *gpu,
7789                                              NvU32 page_sizes,
7790                                              uvm_tracker_t *pending_tracker)
7791 {
7792     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7793     uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
7794     uvm_page_table_range_t *range;
7795     NvU32 page_size;
7796     NV_STATUS status, final_status = NV_OK;
7797 
7798     UVM_ASSERT(gpu_state);
7799 
7800     // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first
7801     // in order to allocate the levels below.
7802     if (block_gpu_supports_2m(va_block, gpu))
7803         page_sizes |= UVM_PAGE_SIZE_2M;
7804 
7805     UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes);
7806 
7807     for_each_chunk_size_rev(page_size, page_sizes) {
7808         if (page_size == UVM_PAGE_SIZE_2M)
7809             range = &gpu_state->page_table_range_2m;
7810         else if (page_size == UVM_PAGE_SIZE_4K)
7811             range = &gpu_state->page_table_range_4k;
7812         else
7813             range = &gpu_state->page_table_range_big;
7814 
7815         if (range->table)
7816             continue;
7817 
7818         if (page_size == UVM_PAGE_SIZE_2M) {
7819             UVM_ASSERT(!gpu_state->pte_is_2m);
7820             UVM_ASSERT(!gpu_state->page_table_range_big.table);
7821             UVM_ASSERT(!gpu_state->page_table_range_4k.table);
7822         }
7823         else if (page_size != UVM_PAGE_SIZE_4K) {
7824             UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0);
7825             UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7826         }
7827 
7828         status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker);
7829 
7830         // Keep going to allocate the remaining levels even if the allocation
7831         // requires a retry, since we'll likely still need them when we retry
7832         // anyway.
7833         if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
7834             final_status = NV_ERR_MORE_PROCESSING_REQUIRED;
7835         else if (status != NV_OK)
7836             return status;
7837     }
7838 
7839     return final_status;
7840 }
7841 
7842 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
7843                                             uvm_gpu_t *gpu,
7844                                             uvm_va_block_new_pte_state_t *new_pte_state,
7845                                             uvm_tracker_t *pending_tracker)
7846 {
7847     NvU32 page_sizes = 0;
7848 
7849     if (new_pte_state->pte_is_2m) {
7850         page_sizes |= UVM_PAGE_SIZE_2M;
7851     }
7852     else {
7853         if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
7854             page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu);
7855 
7856         if (new_pte_state->needs_4k)
7857             page_sizes |= UVM_PAGE_SIZE_4K;
7858         else
7859             UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7860     }
7861 
7862     return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker);
7863 }
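
// For example, a new_pte_state with pte_is_2m == false, a non-empty big_ptes
// bitmap, and needs_4k == true on a GPU whose big page size happens to be 64K
// would request page_sizes == (UVM_PAGE_SIZE_64K | UVM_PAGE_SIZE_4K), and
// block_alloc_ptes_with_retry() would additionally pull in UVM_PAGE_SIZE_2M on
// GPUs which support 2M PTEs, since that level must be allocated before the
// levels below it. The 64K value is only an example; the real big page size
// comes from uvm_va_block_gpu_big_page_size().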
7864 
7865 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block.
7866 // This is currently used on ATS systems to prevent GPUs from inadvertently
7867 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables,
7868 // which is where the NOATS bit resides.
7869 //
7870 // The current implementation simply pre-allocates the PTEs for the VA Block,
7871 // which is wasteful because the GPU may never need them.
7872 //
7873 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1
7874 // page table entries without having to request PTEs.
7875 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
7876                                              uvm_gpu_va_space_t *gpu_va_space,
7877                                              uvm_tracker_t *pending_tracker)
7878 {
7879     NvU32 page_sizes;
7880     NvU32 big_page_size;
7881     uvm_gpu_t *gpu;
7882     uvm_va_block_gpu_state_t *gpu_state;
7883 
7884     UVM_ASSERT(block);
7885     UVM_ASSERT(gpu_va_space);
7886     UVM_ASSERT(gpu_va_space->ats.enabled);
7887     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
7888 
7889     gpu = gpu_va_space->gpu;
7890     big_page_size = gpu_va_space->page_tables.big_page_size;
7891 
7892     gpu_state = block_gpu_state_get_alloc(block, gpu);
7893     if (!gpu_state)
7894         return NV_ERR_NO_MEMORY;
7895 
7896     // If the VA Block supports 2M pages, allocate the 2M PTE only, as it
7897     // requires less memory
7898     if (block_gpu_supports_2m(block, gpu))
7899         page_sizes = UVM_PAGE_SIZE_2M;
7900     else if (uvm_va_block_num_big_pages(block, big_page_size) > 0)
7901         page_sizes = big_page_size;
7902     else
7903         page_sizes = UVM_PAGE_SIZE_4K;
7904 
7905     return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker);
7906 }
7907 
7908 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker)
7909 {
7910     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7911     NV_STATUS status = NV_OK;
7912 
7913     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7914     // comments in block_pre_populate_pde1_gpu.
7915     if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) {
7916         uvm_gpu_va_space_t *gpu_va_space;
7917 
7918         for_each_gpu_va_space(gpu_va_space, va_space) {
7919             // We only care about systems where ATS is supported and the application
7920             // enabled it.
7921             if (!gpu_va_space->ats.enabled)
7922                 continue;
7923 
7924             status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker);
7925             if (status != NV_OK)
7926                 break;
7927         }
7928     }
7929 
7930     return status;
7931 }
7932 
7933 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
7934                                  uvm_va_block_context_t *block_context,
7935                                  uvm_gpu_t *gpu,
7936                                  const uvm_page_mask_t *unmap_page_mask,
7937                                  uvm_tracker_t *out_tracker)
7938 {
7939     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7940     uvm_pte_bits_gpu_t pte_bit;
7941     uvm_push_t push;
7942     uvm_membar_t tlb_membar;
7943     bool only_local_mappings;
7944     uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask;
7945     NV_STATUS status;
7946     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7947     bool mask_empty;
7948     uvm_processor_mask_t *non_uvm_lite_gpus = &block_context->mapping.non_uvm_lite_gpus;
7949 
7950     // We have to check gpu_state before looking at any VA space state like our
7951     // gpu_va_space, because we could be on the eviction path where we don't
7952     // have a lock on that state. However, since remove_gpu_va_space walks each
7953     // block to unmap the GPU before destroying the gpu_va_space, we're
7954     // guaranteed that if this GPU has page tables, the gpu_va_space can't go
7955     // away while we're holding the block lock.
7956     if (!block_gpu_has_page_tables(block, gpu))
7957         return NV_OK;
7958 
7959     if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]))
7960         return NV_OK;
7961 
7962     // block_gpu_compute_new_pte_state needs a mask of pages which will have
7963     // matching attributes after the operation is performed. In the case of
7964     // unmap, those are the pages with unset bits.
7965     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap);
7966     uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask);
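    // As a small worked example with four pages: if pte_bits[GPU_READ] maps
    // pages {0, 1, 2} and pages_to_unmap is {1, 2}, then scratch_page_mask is
    // first {0} (the still-mapped pages) and after the complement it is
    // {1, 2, 3}, i.e. the pages which will be unmapped once the operation
    // completes, whether because they are being unmapped now or were never
    // mapped.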
7967     block_gpu_compute_new_pte_state(block,
7968                                     gpu,
7969                                     UVM_ID_INVALID,
7970                                     pages_to_unmap,
7971                                     &block_context->scratch_page_mask,
7972                                     new_pte_state);
7973 
7974     status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker);
7975     if (status != NV_OK)
7976         return status;
7977 
7978     only_local_mappings = !block_has_remote_mapping_gpu(block, &block_context->scratch_page_mask, gpu->id, pages_to_unmap);
7979     tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings);
7980 
7981     status = uvm_push_begin_acquire(gpu->channel_manager,
7982                                     UVM_CHANNEL_TYPE_MEMOPS,
7983                                     &block->tracker,
7984                                     &push,
7985                                     "Unmapping pages in block [0x%llx, 0x%llx)",
7986                                     block->start,
7987                                     block->end + 1);
7988     if (status != NV_OK)
7989         return status;
7990 
7991     if (new_pte_state->pte_is_2m) {
7992         // We're either unmapping a whole valid 2M PTE, or we're unmapping all
7993         // remaining pages in a split 2M PTE.
7994         block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar);
7995     }
7996     else if (gpu_state->pte_is_2m) {
7997         // The block is currently mapped as a valid 2M PTE and we're unmapping
7998         // some pages within the 2M, so we have to split it into the appropriate
7999         // mix of big and 4k PTEs.
8000         block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
8001     }
8002     else {
8003         // We're unmapping some pre-existing mix of big and 4K PTEs into some
8004         // other mix of big and 4K PTEs.
8005         block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
8006     }
8007 
8008     uvm_push_end(&push);
8009 
8010     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
8011 
8012         uvm_processor_mask_andnot(non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block));
8013 
8014         UVM_ASSERT(uvm_processor_mask_test(non_uvm_lite_gpus, gpu->id));
8015 
8016         // If the GPU is the only non-UVM-Lite processor with mappings, we can
8017         // safely mark pages as fully unmapped
8018         if (uvm_processor_mask_get_count(non_uvm_lite_gpus) == 1 && !uvm_va_block_is_hmm(block))
8019             uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
8020     }
8021 
8022     // Clear block PTE state
8023     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
8024         mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit],
8025                                            &gpu_state->pte_bits[pte_bit],
8026                                            pages_to_unmap);
8027         if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty)
8028             uvm_processor_mask_clear(&block->mapped, gpu->id);
8029     }
8030 
8031     UVM_ASSERT(block_check_mappings(block, block_context));
8032 
8033     return uvm_tracker_add_push_safe(out_tracker, &push);
8034 }
8035 
8036 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
8037                              uvm_va_block_context_t *va_block_context,
8038                              uvm_processor_id_t id,
8039                              uvm_va_block_region_t region,
8040                              const uvm_page_mask_t *unmap_page_mask,
8041                              uvm_tracker_t *out_tracker)
8042 {
8043     uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask;
8044 
8045     UVM_ASSERT(!uvm_va_block_is_dead(va_block));
8046     uvm_assert_mutex_locked(&va_block->lock);
8047 
8048     if (UVM_ID_IS_CPU(id)) {
8049        block_unmap_cpu(va_block, va_block_context, region, unmap_page_mask);
8050        return NV_OK;
8051     }
8052 
8053     uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask);
8054 
8055     return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker);
8056 }
8057 
8058 // This function essentially works as a wrapper around vm_insert_page (hence
8059 // the similar function prototype). This is needed since vm_insert_page
8060 // doesn't take permissions as input, but uses vma->vm_page_prot instead.
8061 // Since we may have multiple VA blocks under one VMA which need to map
8062 // with different permissions, we have to manually change vma->vm_page_prot for
8063 // each call to vm_insert_page. Multiple faults under one VMA in separate
8064 // blocks can be serviced concurrently, so the VMA wrapper lock is used
8065 // to protect access to vma->vm_page_prot.
8066 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma,
8067                                      NvU64 addr,
8068                                      struct page *page,
8069                                      uvm_prot_t new_prot)
8070 {
8071     uvm_vma_wrapper_t *vma_wrapper;
8072     unsigned long target_flags;
8073     pgprot_t target_pgprot;
8074     int ret;
8075 
8076     UVM_ASSERT(vma);
8077     UVM_ASSERT(vma->vm_private_data);
8078 
8079     vma_wrapper = vma->vm_private_data;
8080     target_flags = vma->vm_flags;
8081 
8082     if (new_prot == UVM_PROT_READ_ONLY)
8083         target_flags &= ~VM_WRITE;
8084 
8085     target_pgprot = vm_get_page_prot(target_flags);
8086 
8087     // Take VMA wrapper lock to check vma->vm_page_prot
8088     uvm_down_read(&vma_wrapper->lock);
8089 
8090     // Take a write lock if we need to modify vma->vm_page_prot:
8091     // - vma->vm_page_prot creates writable PTEs but new_prot is RO
8092     // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
8093     if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) {
8094         uvm_up_read(&vma_wrapper->lock);
8095         uvm_down_write(&vma_wrapper->lock);
8096 
8097         vma->vm_page_prot = target_pgprot;
8098 
8099         uvm_downgrade_write(&vma_wrapper->lock);
8100     }
8101 
8102     ret = vm_insert_page(vma, addr, page);
8103     uvm_up_read(&vma_wrapper->lock);
8104     if (ret) {
8105         UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
8106         return errno_to_nv_status(ret);
8107     }
8108 
8109     return NV_OK;
8110 }
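
// The locking in uvm_cpu_insert_page() follows a common check-then-upgrade
// pattern: inspect the shared state under the read lock and, only if it must
// change, re-take the lock in write mode, update, then downgrade. A generic
// sketch of the pattern (hypothetical 'lock' and 'state' names, not driver
// code):
//
//     down_read(&lock);
//     if (state != desired) {
//         up_read(&lock);
//         down_write(&lock);
//         state = desired;            // Idempotent, so a racing update is fine
//         downgrade_write(&lock);
//     }
//     // ... use state under the read lock ...
//     up_read(&lock);
//
// Another writer may change the state between up_read() and down_write(),
// which is why the update must be safe to apply unconditionally.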
8111 
8112 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block,
8113                                        struct vm_area_struct *hmm_vma,
8114                                        uvm_page_index_t page_index)
8115 {
8116     uvm_prot_t logical_prot;
8117 
8118     if (uvm_va_block_is_hmm(va_block)) {
8119         NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index);
8120 
8121         logical_prot = uvm_hmm_compute_logical_prot(va_block, hmm_vma, addr);
8122     }
8123     else {
8124         uvm_va_range_t *va_range = va_block->va_range;
8125 
8126         UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
8127 
8128         // Zombified VA ranges no longer have a vma, so they have no permissions
8129         if (uvm_va_range_is_managed_zombie(va_range)) {
8130             logical_prot = UVM_PROT_NONE;
8131         }
8132         else {
8133             struct vm_area_struct *vma;
8134 
8135             vma = uvm_va_range_vma(va_range);
8136 
8137             if (!(vma->vm_flags & VM_READ))
8138                 logical_prot = UVM_PROT_NONE;
8139             else if (!(vma->vm_flags & VM_WRITE))
8140                 logical_prot = UVM_PROT_READ_ONLY;
8141             else
8142                 logical_prot = UVM_PROT_READ_WRITE_ATOMIC;
8143         }
8144     }
8145 
8146     return logical_prot;
8147 }
8148 
8149 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page)
8150 {
8151     struct page *page;
8152 
8153     if (UVM_ID_IS_CPU(block_page.processor)) {
8154         page = uvm_va_block_get_cpu_page(block, block_page.page_index);
8155     }
8156     else {
8157         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8158         uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor);
8159         size_t chunk_offset;
8160         uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
8161 
8162         UVM_ASSERT(gpu->mem_info.numa.enabled);
8163         page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE;
8164     }
8165 
8166     UVM_ASSERT(page);
8167     return page;
8168 }
8169 
8170 // Creates or upgrades a CPU mapping for the given page, updating the block's
8171 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page
8172 // will be mapped with at least new_prot permissions.
8173 //
8174 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use
8175 // block_unmap_cpu or uvm_va_block_revoke_prot instead.
8176 //
8177 // If the existing mapping is >= new_prot already, this is a no-op.
8178 //
8179 // It is the caller's responsibility to:
8180 //  - Revoke mappings from other processors as appropriate so the CPU can map
8181 //    with new_prot permissions
8182 //  - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference
8183 //    and mmap_lock is held in at least read mode)
8184 //  - For HMM blocks that vma is valid and safe to use, vma->vm_mm has a
8185 //    reference and mmap_lock is held in at least read mode
8186 //  - Ensure that the struct page corresponding to the physical memory being
8187 //    mapped exists
8188 //  - Manage the block's residency bitmap
8189 //  - Ensure that the block hasn't been killed (block->va_range is present)
8190 //  - Update the pte/mapping tracking state on success
8191 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
8192                                        struct vm_area_struct *hmm_vma,
8193                                        uvm_processor_id_t resident_id,
8194                                        uvm_page_index_t page_index,
8195                                        uvm_prot_t new_prot)
8196 {
8197     uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
8198     uvm_va_range_t *va_range = block->va_range;
8199     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8200     struct vm_area_struct *vma;
8201     NV_STATUS status;
8202     NvU64 addr;
8203     struct page *page;
8204     int nid = NUMA_NO_NODE;
8205 
8206     UVM_ASSERT((uvm_va_block_is_hmm(block) && hmm_vma) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
8207     UVM_ASSERT(new_prot != UVM_PROT_NONE);
8208     UVM_ASSERT(new_prot < UVM_PROT_MAX);
8209     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
8210 
8211     uvm_assert_mutex_locked(&block->lock);
8212     if (UVM_ID_IS_CPU(resident_id))
8213         UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index));
8214 
8215     // For the CPU, write implies atomic
8216     if (new_prot == UVM_PROT_READ_WRITE)
8217         new_prot = UVM_PROT_READ_WRITE_ATOMIC;
8218 
8219     // Only upgrades are supported in this function
8220     UVM_ASSERT(curr_prot <= new_prot);
8221 
8222     if (new_prot == curr_prot)
8223         return NV_OK;
8224 
8225     // Check for existing VMA permissions. They could have been modified after
8226     // the initial mmap by mprotect.
8227     if (new_prot > compute_logical_prot(block, hmm_vma, page_index))
8228         return NV_ERR_INVALID_ACCESS_TYPE;
8229 
8230     if (uvm_va_block_is_hmm(block)) {
8231         // Do not map CPU pages because they belong to the Linux kernel.
8232         return NV_OK;
8233     }
8234 
8235     UVM_ASSERT(va_range);
8236 
8237     if (UVM_ID_IS_CPU(resident_id)) {
8238         if (UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
8239             // Add the page's range group range to the range group's migrated list.
8240             uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
8241                                                                       uvm_va_block_cpu_page_address(block, page_index));
8242             if (rgr != NULL) {
8243                 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
8244                 if (list_empty(&rgr->range_group_migrated_list_node))
8245                     list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
8246                 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
8247             }
8248         }
8249 
8250         nid = block_get_page_node_residency(block, page_index);
8251         UVM_ASSERT(nid != NUMA_NO_NODE);
8252     }
8253 
8254     // It's possible here that current->mm != vma->vm_mm. That can happen for
8255     // example due to access_process_vm (ptrace) or get_user_pages from another
8256     // driver.
8257     //
8258     // In such cases the caller has taken care of ref counting vma->vm_mm for
8259     // us, so we can safely operate on the vma but we can't use
8260     // uvm_va_range_vma_current.
8261     vma = uvm_va_range_vma(va_range);
8262     uvm_assert_mmap_lock_locked(vma->vm_mm);
8263     UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm);
8264 
8265     // Add the mapping
8266     addr = uvm_va_block_cpu_page_address(block, page_index);
8267 
8268     // This unmap handles upgrades as vm_insert_page returns -EBUSY when
8269     // there's already a mapping present at fault_addr, so we have to unmap
8270     // first anyway when upgrading from RO -> RW.
8271     if (curr_prot != UVM_PROT_NONE)
8272         unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1);
8273 
8274     // Don't map the CPU until prior copies and GPU PTE updates finish,
8275     // otherwise we might not stay coherent.
8276     status = uvm_tracker_wait(&block->tracker);
8277     if (status != NV_OK)
8278         return status;
8279 
8280     page = block_page_get(block, block_phys_page(resident_id, nid, page_index));
8281     return uvm_cpu_insert_page(vma, addr, page, new_prot);
8282 }
8283 
8284 // Maps the CPU to the given pages which are resident on resident_id.
8285 // map_page_mask is an in/out parameter: the pages which are mapped to
8286 // resident_id are removed from the mask before returning.
8287 //
8288 // Caller must ensure that:
8289 // -  Pages in map_page_mask must not be set in the corresponding cpu.pte_bits
8290 // mask for the requested protection.
8291 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
8292                                   uvm_va_block_context_t *block_context,
8293                                   uvm_processor_id_t resident_id,
8294                                   uvm_va_block_region_t region,
8295                                   uvm_page_mask_t *map_page_mask,
8296                                   uvm_prot_t new_prot,
8297                                   uvm_tracker_t *out_tracker)
8298 {
8299     NV_STATUS status = NV_OK;
8300     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8301     uvm_page_index_t page_index;
8302     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
8303     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE);
8304     uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
8305     uvm_pte_bits_cpu_t pte_bit;
8306 
8307     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
8308 
8309     // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls
8310     //       within block_map_cpu_page_to by doing them once here is helpful.
8311 
8312     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
8313                                   map_page_mask,
8314                                   &block->cpu.pte_bits[prot_pte_bit]));
8315 
8316     // The pages which will actually change are those in the input page mask
8317     // which are resident on the target.
8318     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
8319         return NV_OK;
8320 
8321     status = block_pre_populate_pde1_all_gpus(block, out_tracker);
8322     if (status != NV_OK)
8323         return status;
8324 
8325     block->cpu.ever_mapped = true;
8326 
8327     for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) {
8328         status = block_map_cpu_page_to(block,
8329                                        block_context->hmm.vma,
8330                                        resident_id,
8331                                        page_index,
8332                                        new_prot);
8333         if (status != NV_OK)
8334             break;
8335 
8336         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
8337     }
8338 
8339     // If there was some error, shrink the region so that we only update the
8340     // pte/mapping tracking bits for the pages that succeeded
8341     if (status != NV_OK) {
8342         region = uvm_va_block_region(region.first, page_index);
8343         uvm_page_mask_region_clear_outside(pages_to_map, region);
8344     }
8345 
8346     // If pages are mapped from a remote residency, notify tools of the remote
8347     // mapping events. We skip event notification if the cause is Invalid, which
8348     // is used to signal that this function is being called from the revocation
8349     // path, in order to avoid reporting duplicate events.
8350     if (UVM_ID_IS_GPU(resident_id) &&
8351         va_space->tools.enabled &&
8352         block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) {
8353         uvm_va_block_region_t subregion;
8354         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
8355             uvm_tools_record_map_remote(block,
8356                                         NULL,
8357                                         UVM_ID_CPU,
8358                                         resident_id,
8359                                         uvm_va_block_region_start(block, subregion),
8360                                         uvm_va_block_region_size(subregion),
8361                                         block_context->mapping.cause);
8362         }
8363     }
8364 
8365     // Update CPU mapping state
8366     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
8367         uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
8368 
8369     if (!uvm_va_block_is_hmm(block))
8370         uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
8371 
8372     UVM_ASSERT(block_check_mappings(block, block_context));
8373 
8374     // Remove all pages that were newly-mapped from the input mask
8375     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
8376 
8377     return status;
8378 }
8379 
8380 // Maps the GPU to the given pages which are resident on resident_id.
8381 // map_page_mask is an in/out parameter: the pages which are mapped
8382 // to resident_id are removed from the mask before returning.
8383 //
8384 // Caller must ensure that:
8385 // -  Pages in map_page_mask must not be set in the corresponding pte_bits mask
8386 // for the requested protection on the mapping GPU.
8387 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
8388                                   uvm_va_block_context_t *block_context,
8389                                   uvm_gpu_t *gpu,
8390                                   uvm_processor_id_t resident_id,
8391                                   int resident_nid,
8392                                   uvm_page_mask_t *map_page_mask,
8393                                   uvm_prot_t new_prot,
8394                                   uvm_tracker_t *out_tracker)
8395 {
8396     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8397     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8398     uvm_push_t push;
8399     NV_STATUS status;
8400     uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
8401     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
8402     uvm_pte_bits_gpu_t pte_bit;
8403     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
8404     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
8405     block_pte_op_t pte_op;
8406 
8407     UVM_ASSERT(map_page_mask);
8408     UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
8409 
8410     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
8411         uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
8412         UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
8413     }
8414 
8415     UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
8416                                   map_page_mask,
8417                                   &gpu_state->pte_bits[prot_pte_bit]));
8418 
8419     // The pages which will actually change are those in the input page mask
8420     // which are resident on the target.
8421     if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
8422         return NV_OK;
8423 
8424     UVM_ASSERT(block_check_mapping_residency(va_block, block_context, gpu, resident_id, pages_to_map));
8425 
8426     // For PTE merge/split computation, compute all resident pages which will
8427     // have exactly new_prot after performing the mapping.
8428     uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map);
8429     if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) {
8430         uvm_page_mask_andnot(&block_context->scratch_page_mask,
8431                              &block_context->scratch_page_mask,
8432                              &gpu_state->pte_bits[prot_pte_bit + 1]);
8433     }
8434     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
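    // As a small worked example with four resident pages and new_prot ==
    // UVM_PROT_READ_WRITE: if page 0 is currently mapped RW-atomic, page 3 is
    // mapped read-only, and pages_to_map is {1, 2}, then scratch_page_mask
    // becomes {1, 2}: page 0 is excluded because it already has a higher
    // protection and page 3 because it will keep a lower one, so only pages 1
    // and 2 will have exactly new_prot after the mapping.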
8435 
8436     block_gpu_compute_new_pte_state(va_block,
8437                                     gpu,
8438                                     resident_id,
8439                                     pages_to_map,
8440                                     &block_context->scratch_page_mask,
8441                                     new_pte_state);
8442 
8443     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
8444     if (status != NV_OK)
8445         return status;
8446 
8447     status = uvm_push_begin_acquire(gpu->channel_manager,
8448                                     UVM_CHANNEL_TYPE_MEMOPS,
8449                                     &va_block->tracker,
8450                                     &push,
8451                                     "Mapping pages in block [0x%llx, 0x%llx) as %s",
8452                                     va_block->start,
8453                                     va_block->end + 1,
8454                                     uvm_prot_string(new_prot));
8455     if (status != NV_OK)
8456         return status;
8457 
8458     pte_op = BLOCK_PTE_OP_MAP;
8459     if (new_pte_state->pte_is_2m) {
8460         // We're either modifying permissions of a pre-existing 2M PTE, or all
8461         // permissions match so we can merge to a new 2M PTE.
8462         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
8463     }
8464     else if (gpu_state->pte_is_2m) {
8465         // Permissions on a subset of the existing 2M PTE are being upgraded, so
8466         // we have to split it into the appropriate mix of big and 4k PTEs.
8467         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
8468     }
8469     else {
8470         // We're upgrading permissions on some pre-existing mix of big and 4K
8471         // PTEs into some other mix of big and 4K PTEs.
8472         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
8473     }
8474 
8475     // If we are mapping remotely, record the event
8476     if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) {
8477         uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block);
8478 
8479         UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid);
8480 
8481         for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
8482             uvm_tools_record_map_remote(va_block,
8483                                         &push,
8484                                         gpu->id,
8485                                         resident_id,
8486                                         uvm_va_block_region_start(va_block, subregion),
8487                                         uvm_va_block_region_size(subregion),
8488                                         block_context->mapping.cause);
8489         }
8490     }
8491 
8492     uvm_push_end(&push);
8493 
8494     // Update GPU mapping state
8495     for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
8496         uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map);
8497 
8498     uvm_processor_mask_set(&va_block->mapped, gpu->id);
8499 
8500     // If we are mapping a UVM-Lite GPU or HMM va_block, do not update
8501     // maybe_mapped_pages.
8502     if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id) &&
8503         !uvm_va_block_is_hmm(va_block))
8504         uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
8505 
8506     // Remove all pages resident on this processor from the input mask, which
8507     // were newly-mapped.
8508     uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
8509 
8510     UVM_ASSERT(block_check_mappings(va_block, block_context));
8511 
8512     return uvm_tracker_add_push_safe(out_tracker, &push);
8513 }
8514 
8515 // allowed_nid_mask is only valid if the CPU is set in allowed_mask.
8516 static void map_get_allowed_destinations(uvm_va_block_t *block,
8517                                          uvm_va_block_context_t *va_block_context,
8518                                          const uvm_va_policy_t *policy,
8519                                          uvm_processor_id_t id,
8520                                          uvm_processor_mask_t *allowed_mask,
8521                                          nodemask_t *allowed_nid_mask)
8522 {
8523     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8524 
8525     *allowed_nid_mask = node_possible_map;
8526 
8527     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
8528         // UVM-Lite can only map resident pages on the preferred location
8529         uvm_processor_mask_zero(allowed_mask);
8530         uvm_processor_mask_set(allowed_mask, policy->preferred_location);
8531         if (UVM_ID_IS_CPU(policy->preferred_location) &&
8532             !uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
8533             nodes_clear(*allowed_nid_mask);
8534             node_set(policy->preferred_nid, *allowed_nid_mask);
8535         }
8536     }
8537     else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
8538               (uvm_id_equal(policy->preferred_location, id) &&
8539                !is_uvm_fault_force_sysmem_set() &&
8540                !uvm_hmm_must_use_sysmem(block, va_block_context->hmm.vma))) &&
8541              uvm_va_space_processor_has_memory(va_space, id)) {
8542         // When operating under read-duplication we should only map the local
8543         // processor to cause fault-and-duplicate of remote pages.
8544         //
8545         // The same holds when this processor is the preferred location: only
8546         // create local mappings to force remote pages to fault-and-migrate.
8547         uvm_processor_mask_zero(allowed_mask);
8548         uvm_processor_mask_set(allowed_mask, id);
8549     }
8550     else {
8551         // Common case: Just map wherever the memory happens to reside
8552         uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]);
8553         return;
8554     }
8555 
8556     // Clamp to resident and accessible processors
8557     uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident);
8558     uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]);
8559 }
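
// To summarize the cases above with an example mapping processor 'id' whose
// pages are resident partly on 'id' and partly on the CPU:
//   - UVM-Lite: allowed_mask is clamped to the preferred location, and if the
//     preferred location is a specific CPU NUMA node, allowed_nid_mask is
//     narrowed to that node.
//   - Read duplication, or 'id' is the preferred location and has its own
//     memory: allowed_mask contains only 'id', so remote pages fault and
//     migrate or duplicate rather than being mapped remotely.
//   - Otherwise: allowed_mask is the set of resident processors which 'id'
//     can access, so mappings simply follow current residency.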
8560 
8561 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
8562                            uvm_va_block_context_t *va_block_context,
8563                            uvm_processor_id_t id,
8564                            uvm_va_block_region_t region,
8565                            const uvm_page_mask_t *map_page_mask,
8566                            uvm_prot_t new_prot,
8567                            UvmEventMapRemoteCause cause,
8568                            uvm_tracker_t *out_tracker)
8569 {
8570     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8571     uvm_gpu_t *gpu = NULL;
8572     uvm_processor_mask_t *allowed_destinations;
8573     uvm_processor_id_t resident_id;
8574     const uvm_page_mask_t *pte_mask;
8575     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
8576     NV_STATUS status = NV_OK;
8577     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
8578     nodemask_t *allowed_nid_destinations;
8579 
8580     va_block_context->mapping.cause = cause;
8581 
8582     UVM_ASSERT(new_prot != UVM_PROT_NONE);
8583     UVM_ASSERT(new_prot < UVM_PROT_MAX);
8584     uvm_assert_mutex_locked(&va_block->lock);
8585 
8586     // Mapping is not supported on the eviction path that doesn't hold the VA
8587     // space lock.
8588     uvm_assert_rwsem_locked(&va_space->lock);
8589 
8590     if (UVM_ID_IS_CPU(id)) {
8591         uvm_pte_bits_cpu_t prot_pte_bit;
8592 
8593         // Check if the current thread is allowed to call vm_insert_page
8594         if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm))
8595             return NV_OK;
8596 
8597         prot_pte_bit = get_cpu_pte_bit_index(new_prot);
8598         pte_mask = &va_block->cpu.pte_bits[prot_pte_bit];
8599     }
8600     else {
8601         uvm_va_block_gpu_state_t *gpu_state;
8602         uvm_pte_bits_gpu_t prot_pte_bit;
8603 
8604         gpu = uvm_va_space_get_gpu(va_space, id);
8605 
8606         // Although this GPU UUID is registered in the VA space, it might not have a
8607         // GPU VA space registered.
8608         if (!uvm_gpu_va_space_get(va_space, gpu))
8609             return NV_OK;
8610 
8611         gpu_state = block_gpu_state_get_alloc(va_block, gpu);
8612         if (!gpu_state)
8613             return NV_ERR_NO_MEMORY;
8614 
8615         prot_pte_bit = get_gpu_pte_bit_index(new_prot);
8616         pte_mask = &gpu_state->pte_bits[prot_pte_bit];
8617     }
8618 
8619     uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask);
8620 
8621     if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask))
8622         return NV_OK;
8623 
8624     allowed_destinations = uvm_processor_mask_cache_alloc();
8625     if (!allowed_destinations)
8626         return NV_ERR_NO_MEMORY;
8627 
8628     allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
8629     if (!allowed_nid_destinations) {
8630         uvm_processor_mask_cache_free(allowed_destinations);
8631         return NV_ERR_NO_MEMORY;
8632     }
8633 
8634     // Map per resident location so we can more easily detect physically-
8635     // contiguous mappings.
8636     map_get_allowed_destinations(va_block,
8637                                  va_block_context,
8638                                  policy,
8639                                  id,
8640                                  allowed_destinations,
8641                                  allowed_nid_destinations);
8642     for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
8643         if (UVM_ID_IS_CPU(id)) {
8644             status = block_map_cpu_to(va_block,
8645                                       va_block_context,
8646                                       resident_id,
8647                                       region,
8648                                       running_page_mask,
8649                                       new_prot,
8650                                       out_tracker);
8651         }
8652         else if (UVM_ID_IS_CPU(resident_id)) {
8653             int nid;
8654 
8655             // map_get_allowed_destinations() will set the mask of CPU NUMA
8656             // nodes that should be mapped.
8657             for_each_node_mask(nid, *allowed_nid_destinations) {
8658                 status = block_map_gpu_to(va_block,
8659                                           va_block_context,
8660                                           gpu,
8661                                           resident_id,
8662                                           nid,
8663                                           running_page_mask,
8664                                           new_prot,
8665                                           out_tracker);
8666                 if (status != NV_OK)
8667                     break;
8668             }
8669         }
8670         else {
8671             status = block_map_gpu_to(va_block,
8672                                       va_block_context,
8673                                       gpu,
8674                                       resident_id,
8675                                       NUMA_NO_NODE,
8676                                       running_page_mask,
8677                                       new_prot,
8678                                       out_tracker);
8679         }
8680 
8681         if (status != NV_OK)
8682             break;
8683 
8684         // If we've mapped all requested pages, we're done
8685         if (uvm_page_mask_region_empty(running_page_mask, region))
8686             break;
8687     }
8688 
8689     uvm_processor_mask_cache_free(allowed_destinations);
8690     uvm_kvfree(allowed_nid_destinations);
8691 
8692     return status;
8693 }
8694 
8695 // Revokes CPU write permission for the given pages. This is implemented by
8696 // unmapping all pages and re-mapping them later with the lower permission,
8697 // which is required because vm_insert_page can only be used for upgrades from Invalid.
8698 //
8699 // Caller must ensure that:
8700 // -  Pages in revoke_page_mask must be set in the
8701 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask.
8702 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block,
8703                                         uvm_va_block_context_t *block_context,
8704                                         uvm_va_block_region_t region,
8705                                         const uvm_page_mask_t *revoke_page_mask,
8706                                         uvm_tracker_t *out_tracker)
8707 {
8708     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8709     uvm_va_block_region_t subregion;
8710 
8711     UVM_ASSERT(revoke_page_mask);
8712 
8713     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
8714 
8715     block_unmap_cpu(block, block_context, region, revoke_page_mask);
8716 
8717     // Coalesce revocation event notification
8718     for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) {
8719         uvm_perf_event_notify_revocation(&va_space->perf_events,
8720                                          block,
8721                                          UVM_ID_CPU,
8722                                          uvm_va_block_region_start(block, subregion),
8723                                          uvm_va_block_region_size(subregion),
8724                                          UVM_PROT_READ_WRITE_ATOMIC,
8725                                          UVM_PROT_READ_ONLY);
8726     }
8727 
8728     // uvm_va_block_map will skip this remap if we aren't holding the right mm
8729     // lock.
8730     return uvm_va_block_map(block,
8731                             block_context,
8732                             UVM_ID_CPU,
8733                             region,
8734                             revoke_page_mask,
8735                             UVM_PROT_READ_ONLY,
8736                             UvmEventMapRemoteCauseInvalid,
8737                             out_tracker);
8738 }
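
// Illustrative sketch (not built; guarded by "#if 0"): how a caller holding the
// usual VA space and block locking done by real callers might downgrade CPU
// mappings of a whole block from read-write to read-only through the public
// revoke API. The function name is hypothetical, and passing NULL as
// revoke_page_mask assumes it means "all pages in the region", matching the
// NULL page-mask convention used by the unmap calls elsewhere in this file.
#if 0
static NV_STATUS example_downgrade_cpu_to_read_only(uvm_va_block_t *va_block,
                                                    uvm_va_block_context_t *va_block_context)
{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
    NV_STATUS status;

    uvm_mutex_lock(&va_block->lock);

    // Revoking UVM_PROT_READ_WRITE removes write permission, leaving the pages
    // mapped read-only via block_revoke_cpu_write() above.
    status = uvm_va_block_revoke_prot(va_block,
                                      va_block_context,
                                      UVM_ID_CPU,
                                      region,
                                      NULL,
                                      UVM_PROT_READ_WRITE,
                                      &local_tracker);

    uvm_mutex_unlock(&va_block->lock);

    if (status == NV_OK)
        status = uvm_tracker_wait(&local_tracker);

    uvm_tracker_deinit(&local_tracker);

    return status;
}
#endif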
8739 
8740 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
8741                                               uvm_va_block_context_t *block_context,
8742                                               uvm_gpu_t *gpu,
8743                                               uvm_prot_t prot_revoked,
8744                                               const uvm_page_mask_t *pages_revoked)
8745 {
8746     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8747     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
8748     uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
8749     uvm_pte_bits_gpu_t pte_bit;
8750 
8751     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
8752         uvm_prot_t old_prot;
8753 
8754         if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
8755             continue;
8756 
8757         if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
8758             old_prot = UVM_PROT_READ_WRITE_ATOMIC;
8759         else
8760             old_prot = UVM_PROT_READ_WRITE;
8761 
8762         for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
8763             uvm_perf_event_notify_revocation(&va_space->perf_events,
8764                                              block,
8765                                              gpu->id,
8766                                              uvm_va_block_region_start(block, subregion),
8767                                              uvm_va_block_region_size(subregion),
8768                                              old_prot,
8769                                              prot_revoked - 1);
8770         }
8771     }
8772 }
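
// Illustrative note (guarded by "#if 0", hypothetical helper name): the
// "prot_revoked - 1" arithmetic above relies on uvm_prot_t being ordered so
// that subtracting one yields the next-lower permission
// (UVM_PROT_READ_WRITE_ATOMIC -> UVM_PROT_READ_WRITE -> UVM_PROT_READ_ONLY).
// A minimal sketch of that assumption:
#if 0
static uvm_prot_t example_next_lower_prot(uvm_prot_t prot)
{
    // Only permissions above read-only can be lowered
    UVM_ASSERT(prot > UVM_PROT_READ_ONLY);
    UVM_ASSERT(prot < UVM_PROT_MAX);

    return prot - 1;
}
#endif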
8773 
8774 // Revokes the given pages mapped by the given GPU which are resident on
8775 // resident_id. revoke_page_mask is an in/out parameter: the pages which have
8776 // the appropriate permissions and are mapped to resident_id are removed from
8777 // the mask before returning.
8778 //
8779 // Caller must ensure that:
8780 // -  Pages in revoke_page_mask must be set in the corresponding pte_bits mask
8781 // for the protection to be revoked on the mapping GPU.
8782 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
8783                                           uvm_va_block_context_t *block_context,
8784                                           uvm_gpu_t *gpu,
8785                                           uvm_processor_id_t resident_id,
8786                                           uvm_page_mask_t *revoke_page_mask,
8787                                           uvm_prot_t prot_to_revoke,
8788                                           uvm_tracker_t *out_tracker)
8789 {
8790     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8791     uvm_push_t push;
8792     NV_STATUS status;
8793     uvm_pte_bits_gpu_t pte_bit;
8794     uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
8795     uvm_prot_t new_prot = prot_to_revoke - 1;
8796     uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
8797     block_pte_op_t pte_op;
8798     const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
8799     uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
8800 
8801     UVM_ASSERT(revoke_page_mask);
8802     UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
8803 
8804     // The pages which will actually change are those in the input page mask
8805     // which are resident on the target.
8806     if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
8807         return NV_OK;
8808 
8809     UVM_ASSERT(block_check_mapping_residency(va_block, block_context, gpu, resident_id, pages_to_revoke));
8810 
8811     // For PTE merge/split computation, compute all resident pages which will
8812     // have exactly prot_to_revoke-1 after performing the revocation.
8813     uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
8814     uvm_page_mask_andnot(&block_context->scratch_page_mask,
8815                          &gpu_state->pte_bits[prot_pte_bit - 1],
8816                          &block_context->scratch_page_mask);
8817     uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
8818 
8819     block_gpu_compute_new_pte_state(va_block,
8820                                     gpu,
8821                                     resident_id,
8822                                     pages_to_revoke,
8823                                     &block_context->scratch_page_mask,
8824                                     new_pte_state);
8825 
8826     status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
8827     if (status != NV_OK)
8828         return status;
8829 
8830     status = uvm_push_begin_acquire(gpu->channel_manager,
8831                                     UVM_CHANNEL_TYPE_MEMOPS,
8832                                     &va_block->tracker,
8833                                     &push,
8834                                     "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
8835                                     uvm_prot_string(prot_to_revoke),
8836                                     va_block->start,
8837                                     va_block->end + 1);
8838     if (status != NV_OK)
8839         return status;
8840 
8841     pte_op = BLOCK_PTE_OP_REVOKE;
8842     if (new_pte_state->pte_is_2m) {
8843         // We're either modifying permissions of a pre-existing 2M PTE, or all
8844         // permissions match so we can merge to a new 2M PTE.
8845         block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
8846     }
8847     else if (gpu_state->pte_is_2m) {
8848         // Permissions on a subset of the existing 2M PTE are being downgraded,
8849         // so we have to split it into the appropriate mix of big and 4k PTEs.
8850         block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
8851     }
8852     else {
8853         // We're downgrading permissions on some pre-existing mix of big and 4K
8854         // PTEs into some other mix of big and 4K PTEs.
8855         block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
8856     }
8857 
8858     uvm_push_end(&push);
8859 
8860     block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
8861 
8862     // Update GPU mapping state
8863     for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
8864         uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
8865 
8866     // Remove all pages resident on this processor from the input mask: both
8867     // the pages which were just revoked and the pages which already had the
8868     // correct permissions.
8869     uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke);
8870 
8871     UVM_ASSERT(block_check_mappings(va_block, block_context));
8872 
8873     return uvm_tracker_add_push_safe(out_tracker, &push);
8874 }
8875 
8876 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
8877                                    uvm_va_block_context_t *va_block_context,
8878                                    uvm_processor_id_t id,
8879                                    uvm_va_block_region_t region,
8880                                    const uvm_page_mask_t *revoke_page_mask,
8881                                    uvm_prot_t prot_to_revoke,
8882                                    uvm_tracker_t *out_tracker)
8883 {
8884     uvm_gpu_t *gpu;
8885     uvm_va_block_gpu_state_t *gpu_state;
8886     uvm_processor_mask_t *resident_procs;
8887     uvm_processor_id_t resident_id;
8888     uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask;
8889     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8890     uvm_pte_bits_gpu_t prot_pte_bit;
8891     NV_STATUS status = NV_OK;
8892 
8893     UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY);
8894     UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX);
8895     uvm_assert_mutex_locked(&va_block->lock);
8896 
8897     if (UVM_ID_IS_CPU(id)) {
8898         if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC)
8899             return NV_OK;
8900 
8901         if (uvm_va_block_is_hmm(va_block)) {
8902             // Linux is responsible for CPU page table updates.
8903             uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region);
8904             return NV_OK;
8905         }
8906 
8907         uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
8908 
8909         if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]))
8910             return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker);
8911 
8912         return NV_OK;
8913     }
8914 
8915     gpu = uvm_va_space_get_gpu(va_space, id);
8916 
8917     // UVM-Lite GPUs should never have access revoked
8918     UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id),
8919                    "GPU %s\n", uvm_gpu_name(gpu));
8920 
8921     // Return early if there are no mappings for the GPU present in the block
8922     if (!uvm_processor_mask_test(&va_block->mapped, gpu->id))
8923         return NV_OK;
8924 
8925     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8926     prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
8927 
8928     uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
8929 
8930     if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit]))
8931         return NV_OK;
8932 
8933     resident_procs = uvm_processor_mask_cache_alloc();
8934     if (!resident_procs)
8935         return NV_ERR_NO_MEMORY;
8936 
8937     // Revoke per resident location so we can more easily detect physically-
8938     // contiguous mappings.
8939     uvm_processor_mask_copy(resident_procs, &va_block->resident);
8940 
8941     for_each_closest_id(resident_id, resident_procs, gpu->id, va_space) {
8942         NV_STATUS status = block_revoke_prot_gpu_to(va_block,
8943                                                     va_block_context,
8944                                                     gpu,
8945                                                     resident_id,
8946                                                     running_page_mask,
8947                                                     prot_to_revoke,
8948                                                     out_tracker);
8949         if (status != NV_OK)
8950             break;
8951 
8952         // If we've revoked all requested pages, we're done
8953         if (uvm_page_mask_region_empty(running_page_mask, region))
8954             break;
8955     }
8956 
8957     uvm_processor_mask_cache_free(resident_procs);
8958 
8959     return status;
8960 }
8961 
8962 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
8963                                 uvm_va_block_context_t *va_block_context,
8964                                 const uvm_processor_mask_t *map_processor_mask,
8965                                 uvm_va_block_region_t region,
8966                                 const uvm_page_mask_t *map_page_mask,
8967                                 uvm_prot_t new_prot,
8968                                 UvmEventMapRemoteCause cause)
8969 {
8970     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
8971     NV_STATUS status = NV_OK;
8972     NV_STATUS tracker_status;
8973     uvm_processor_id_t id;
8974 
8975     for_each_id_in_mask(id, map_processor_mask) {
8976         status = uvm_va_block_map(va_block,
8977                                   va_block_context,
8978                                   id,
8979                                   region,
8980                                   map_page_mask,
8981                                   new_prot,
8982                                   cause,
8983                                   &local_tracker);
8984         if (status != NV_OK)
8985             break;
8986     }
8987 
8988     // Regardless of error, add the successfully-pushed mapping operations into
8989     // the block's tracker. Note that we can't overwrite the tracker because we
8990     // aren't guaranteed that the map actually pushed anything (in which case it
8991     // would've acquired the block tracker first).
8992     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
8993     uvm_tracker_deinit(&local_tracker);
8994 
8995     return status == NV_OK ? tracker_status : status;
8996 }
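
// Illustrative sketch (guarded by "#if 0"): the local-tracker pattern used by
// uvm_va_block_map_mask() above, in isolation. Work pushed by the callee lands
// in a caller-owned local tracker; whatever was successfully pushed is added to
// the block tracker even on error, and the first failure status takes priority
// over the tracker status. example_push_work() is a hypothetical stand-in for
// any operation that appends pushes to the given tracker.
#if 0
static NV_STATUS example_local_tracker_pattern(uvm_va_block_t *va_block)
{
    uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
    NV_STATUS status;
    NV_STATUS tracker_status;

    status = example_push_work(va_block, &local_tracker); // hypothetical

    // Add (never overwrite) into the block tracker: the work may not have
    // pushed anything, in which case it never acquired the existing tracker.
    tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
    uvm_tracker_deinit(&local_tracker);

    return status == NV_OK ? tracker_status : status;
}
#endif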
8997 
8998 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
8999                                   uvm_va_block_context_t *va_block_context,
9000                                   const uvm_processor_mask_t *unmap_processor_mask,
9001                                   uvm_va_block_region_t region,
9002                                   const uvm_page_mask_t *unmap_page_mask)
9003 {
9004     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9005     NV_STATUS status = NV_OK;
9006     NV_STATUS tracker_status;
9007     uvm_processor_id_t id;
9008 
9009     // Watch out, unmap_processor_mask could change during iteration since it
9010     // could be va_block->mapped.
9011     for_each_id_in_mask(id, unmap_processor_mask) {
9012         // Errors could either be a system-fatal error (ECC) or an allocation
9013         // retry due to PTE splitting. In either case we should stop after
9014         // hitting the first one.
9015         status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker);
9016         if (status != NV_OK)
9017             break;
9018     }
9019 
9020     // See the comment in uvm_va_block_map_mask for adding to the tracker.
9021     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9022     uvm_tracker_deinit(&local_tracker);
9023 
9024     return status == NV_OK ? tracker_status : status;
9025 }
9026 
9027 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
9028                                         uvm_va_block_context_t *va_block_context,
9029                                         const uvm_processor_mask_t *revoke_processor_mask,
9030                                         uvm_va_block_region_t region,
9031                                         const uvm_page_mask_t *revoke_page_mask,
9032                                         uvm_prot_t prot_to_revoke)
9033 {
9034     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9035     NV_STATUS status = NV_OK;
9036     NV_STATUS tracker_status;
9037     uvm_processor_id_t id;
9038 
9039     for_each_id_in_mask(id, revoke_processor_mask) {
9040         status = uvm_va_block_revoke_prot(va_block,
9041                                           va_block_context,
9042                                           id,
9043                                           region,
9044                                           revoke_page_mask,
9045                                           prot_to_revoke,
9046                                           &local_tracker);
9047         if (status != NV_OK)
9048             break;
9049     }
9050 
9051     // See the comment in uvm_va_block_map_mask for adding to the tracker.
9052     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9053     uvm_tracker_deinit(&local_tracker);
9054 
9055     return status == NV_OK ? tracker_status : status;
9056 }
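
// Illustrative usage sketch (guarded by "#if 0"): revoking atomic access from
// every processor currently mapping the block except a designated owner, using
// the mask-based API above. The helper name, the caller-provided scratch mask,
// and the assumption that a NULL revoke_page_mask means "all pages in the
// region" are hypothetical; real callers also hold the VA space and block locks
// required by uvm_va_block_revoke_prot().
#if 0
static NV_STATUS example_revoke_atomics_from_non_owners(uvm_va_block_t *va_block,
                                                        uvm_va_block_context_t *va_block_context,
                                                        uvm_processor_mask_t *scratch_mask,
                                                        uvm_processor_id_t owner)
{
    uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);

    // Everything mapping the block, minus the processor keeping atomic access
    uvm_processor_mask_copy(scratch_mask, &va_block->mapped);
    uvm_processor_mask_clear(scratch_mask, owner);

    return uvm_va_block_revoke_prot_mask(va_block,
                                         va_block_context,
                                         scratch_mask,
                                         region,
                                         NULL,
                                         UVM_PROT_READ_WRITE_ATOMIC);
}
#endif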
9057 
9058 // Updates the read_duplicated_pages mask in the block when the state of GPU id
9059 // is being destroyed
9060 static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
9061                                               uvm_gpu_id_t id,
9062                                               uvm_va_block_gpu_state_t *gpu_state)
9063 {
9064     uvm_gpu_id_t running_id;
9065     bool first = true;
9066     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9067     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9068     uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask;
9069     uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask;
9070 
9071     uvm_page_mask_zero(&block->read_duplicated_pages);
9072 
9073     for_each_id_in_mask(running_id, &block->resident) {
9074         const uvm_page_mask_t *running_residency_mask;
9075 
9076         if (uvm_id_equal(running_id, id))
9077             continue;
9078 
9079         running_residency_mask = uvm_va_block_resident_mask_get(block, running_id, NUMA_NO_NODE);
9080 
9081         if (first) {
9082             uvm_page_mask_copy(running_page_mask, running_residency_mask);
9083             first = false;
9084             continue;
9085         }
9086 
9087         if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask))
9088             uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask);
9089 
9090         uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask);
9091     }
9092 }
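
// Illustrative sketch (guarded by "#if 0", hypothetical mask names): the
// recomputation above is a "resident on two or more processors" calculation.
// For each residency mask, pages already present in the running union that
// also appear in the current mask are read-duplicated. The same idea as a
// standalone helper over an array of residency masks:
#if 0
static void example_compute_duplicated_pages(uvm_page_mask_t *duplicated,    // out
                                             uvm_page_mask_t *running_union, // scratch
                                             uvm_page_mask_t *overlap,       // scratch
                                             const uvm_page_mask_t **resident_masks,
                                             size_t count)
{
    size_t i;

    uvm_page_mask_zero(duplicated);
    uvm_page_mask_zero(running_union);

    for (i = 0; i < count; i++) {
        // Pages in both the union-so-far and this mask are resident in at
        // least two locations
        if (uvm_page_mask_and(overlap, running_union, resident_masks[i]))
            uvm_page_mask_or(duplicated, duplicated, overlap);

        uvm_page_mask_or(running_union, running_union, resident_masks[i]);
    }
}
#endif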
9093 
9094 // Unmaps all GPU mappings under this block, frees the page tables, and frees
9095 // all the GPU chunks. This simply drops the chunks on the floor, so the caller
9096 // must take care of copying the data elsewhere if it needs to remain intact.
9097 //
9098 // This serializes on the block tracker since it must unmap page tables.
9099 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_id_t id)
9100 {
9101     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
9102     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9103     uvm_gpu_va_space_t *gpu_va_space;
9104     uvm_gpu_t *gpu, *other_gpu;
9105 
9106     if (!gpu_state)
9107         return;
9108 
9109     uvm_assert_mutex_locked(&block->lock);
9110 
9111     // Unmap PTEs and free page tables
9112     gpu = uvm_va_space_get_gpu(va_space, id);
9113     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
9114     if (gpu_va_space) {
9115 
9116         uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
9117     }
9118 
9119     UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
9120 
9121     // No processor should have this GPU mapped at this point
9122     UVM_ASSERT(block_check_processor_not_mapped(block, block_context, id));
9123 
9124     // We need to remove the mappings of the indirect peers from the reverse
9125     // map when the GPU state is being destroyed (for example, on
9126     // unregister_gpu) and when peer access between indirect peers is disabled.
9127     // However, we need to avoid double mapping removals. There are two
9128     // possible scenarios:
9129     // - Disable peer access first. This will remove all mappings between A and
9130     // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
9131     // unregister_gpu will not operate on that pair of GPUs.
9132     // - Unregister GPU first. This will remove all mappings from all indirect
9133     // peers to the GPU being unregistered. It will also destroy its GPU state.
9134     // Subsequent calls to disable peers will remove the mappings from the GPU
9135     // being unregistered, but never to the GPU being unregistered (since it no
9136     // longer has a valid GPU state).
9137     for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
9138         block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
9139 
9140     if (gpu_state->chunks) {
9141         size_t i, num_chunks;
9142 
9143         update_read_duplicated_pages_mask(block, id, gpu_state);
9144         uvm_page_mask_zero(&gpu_state->resident);
9145         block_clear_resident_processor(block, id);
9146 
9147         num_chunks = block_num_gpu_chunks(block, gpu);
9148         for (i = 0; i < num_chunks; i++) {
9149             uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
9150             if (!chunk)
9151                 continue;
9152 
9153             uvm_mmu_chunk_unmap(chunk, &block->tracker);
9154             uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
9155         }
9156 
9157         uvm_kvfree(gpu_state->chunks);
9158     }
9159     else {
9160         UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
9161     }
9162 
9163 
9164     // Pending operations may still need the DMA memory to be mapped.
9165     uvm_tracker_wait(&block->tracker);
9166 
9167     block_gpu_unmap_phys_all_cpu_pages(block, gpu);
9168     uvm_processor_mask_clear(&block->evicted_gpus, id);
9169 
9170     kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
9171     block->gpus[uvm_id_gpu_index(id)] = NULL;
9172 }
9173 
9174 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
9175 {
9176     if (range->table) {
9177         uvm_page_tree_put_ptes(tree, range);
9178         memset(range, 0, sizeof(*range));
9179     }
9180 }
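
// Illustrative note (guarded by "#if 0"): clearing the range after putting the
// PTEs makes block_put_ptes_safe() idempotent and safe on never-allocated
// ranges (range->table stays NULL), so teardown paths can call it
// unconditionally for every page size, as uvm_va_block_remove_gpu_va_space()
// does below. The helper name here is hypothetical.
#if 0
static void example_put_all_page_table_ranges(uvm_page_tree_t *tree,
                                              uvm_va_block_gpu_state_t *gpu_state)
{
    // Safe even if some ranges were never allocated or were already put
    block_put_ptes_safe(tree, &gpu_state->page_table_range_4k);
    block_put_ptes_safe(tree, &gpu_state->page_table_range_big);
    block_put_ptes_safe(tree, &gpu_state->page_table_range_2m);
}
#endif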
9181 
9182 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
9183 {
9184     uvm_assert_mutex_locked(&va_block->lock);
9185 
9186     if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
9187         return NV_OK;
9188 
9189     // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
9190     // comments in pre_populate_pde1_gpu.
9191     return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
9192 }
9193 
9194 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
9195                                       uvm_gpu_va_space_t *gpu_va_space,
9196                                       uvm_va_block_context_t *block_context)
9197 {
9198     uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
9199     uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
9200     uvm_gpu_t *gpu = gpu_va_space->gpu;
9201     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
9202     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9203     uvm_push_t push;
9204     NV_STATUS status;
9205 
9206     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9207 
9208     if (!gpu_state)
9209         return;
9210 
9211     uvm_assert_mutex_locked(&va_block->lock);
9212 
9213     // Unmapping the whole block won't cause a page table split, so this should
9214     // only fail if we have a system-fatal error.
9215     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
9216     if (status != NV_OK) {
9217         UVM_ASSERT(status == uvm_global_get_status());
9218         return; // Just leak
9219     }
9220 
9221     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
9222 
9223     // Reset the page tables if other allocations could reuse them
9224     if (!block_gpu_supports_2m(va_block, gpu) &&
9225         !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
9226 
9227         status = uvm_push_begin_acquire(gpu->channel_manager,
9228                                         UVM_CHANNEL_TYPE_MEMOPS,
9229                                         &local_tracker,
9230                                         &push,
9231                                         "Resetting PTEs for block [0x%llx, 0x%llx)",
9232                                         va_block->start,
9233                                         va_block->end + 1);
9234         if (status != NV_OK) {
9235             UVM_ASSERT(status == uvm_global_get_status());
9236             return; // Just leak
9237         }
9238 
9239         uvm_pte_batch_begin(&push, pte_batch);
9240         uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
9241 
9242         // When the big PTEs are active, the 4k PTEs under them are garbage.
9243         // Make them invalid so the page tree code can reuse them for other
9244         // allocations on this VA. These don't need TLB invalidates since the
9245         // big PTEs above them are active.
9246         if (gpu_state->page_table_range_4k.table) {
9247             uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
9248             block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
9249         }
9250 
9251         // We unmapped all big PTEs above, which means they have the unmapped
9252         // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
9253         // invalid to activate the 4ks below so new allocations using just those
9254         // 4k PTEs will work.
9255         block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch);
9256 
9257         uvm_pte_batch_end(pte_batch);
9258         uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE);
9259 
9260         uvm_push_end(&push);
9261         uvm_tracker_overwrite_with_push(&local_tracker, &push);
9262     }
9263 
9264     // The unmap must finish before we free the page tables
9265     status = uvm_tracker_wait_deinit(&local_tracker);
9266     if (status != NV_OK)
9267         return; // System-fatal error, just leak
9268 
9269     // Note that if the PTE is currently 2M with lower tables allocated but not
9270     // in use, calling put_ptes on those lower ranges will re-write the 2M entry
9271     // to be a PDE.
9272     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k);
9273     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big);
9274     block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m);
9275 
9276     gpu_state->pte_is_2m = false;
9277     gpu_state->initialized_big = false;
9278     gpu_state->activated_big = false;
9279     gpu_state->activated_4k = false;
9280     bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9281 
9282     UVM_ASSERT(block_check_mappings(va_block, block_context));
9283 }
9284 
9285 NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
9286 {
9287     NV_STATUS status;
9288     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9289 
9290     UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
9291     uvm_assert_rwsem_locked_write(&va_space->lock);
9292     uvm_assert_mutex_locked(&va_block->lock);
9293 
9294     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
9295         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9296         if (status != NV_OK)
9297             return status;
9298 
9299         status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
9300         if (status != NV_OK) {
9301             block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9302             return status;
9303         }
9304     }
9305 
9306     // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
9307     //       call it here.
9308 
9309     return NV_OK;
9310 }
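
// Illustrative sketch (guarded by "#if 0", all helper names hypothetical): the
// indirect-peer setup above follows a standard "do A, then B, undo A if B
// fails" rollback shape so that a failed enable leaves no partial mappings
// behind. The same shape in isolation:
#if 0
static NV_STATUS example_pairwise_setup(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
{
    NV_STATUS status;

    status = example_setup_one_direction(va_block, gpu0, gpu1);   // hypothetical
    if (status != NV_OK)
        return status;

    status = example_setup_one_direction(va_block, gpu1, gpu0);   // hypothetical
    if (status != NV_OK) {
        // Roll back the first direction before reporting the failure
        example_teardown_one_direction(va_block, gpu0, gpu1);     // hypothetical
        return status;
    }

    return NV_OK;
}
#endif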
9311 
9312 void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
9313 {
9314     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9315     NV_STATUS status;
9316     uvm_tracker_t tracker = UVM_TRACKER_INIT();
9317     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9318     uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask;
9319     const uvm_page_mask_t *resident0;
9320     const uvm_page_mask_t *resident1;
9321 
9322     uvm_assert_mutex_locked(&va_block->lock);
9323 
9324     // See comment in block_destroy_gpu_state
9325     if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
9326         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9327         block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
9328     }
9329 
9330     // If either of the GPUs doesn't have GPU state then nothing could be mapped
9331     // between them.
9332     if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
9333         return;
9334 
9335     resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id, NUMA_NO_NODE);
9336     resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id, NUMA_NO_NODE);
9337 
9338     // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
9339     if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
9340         status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
9341         if (status != NV_OK) {
9342             // Since all PTEs unmapped by this call have the same aperture, page
9343             // splits should never be required so any failure should be the
9344             // result of a system-fatal error.
9345             UVM_ASSERT_MSG(status == uvm_global_get_status(),
9346                            "Unmapping failed: %s, GPU %s\n",
9347                            nvstatusToString(status),
9348                            uvm_gpu_name(gpu0));
9349         }
9350     }
9351 
9352     // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
9353     if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
9354         status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
9355         if (status != NV_OK) {
9356             UVM_ASSERT_MSG(status == uvm_global_get_status(),
9357                            "Unmapping failed: %s, GPU %s\n",
9358                            nvstatusToString(status),
9359                            uvm_gpu_name(gpu0));
9360         }
9361     }
9362 
9363     status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
9364     if (status != NV_OK)
9365         UVM_ASSERT(status == uvm_global_get_status());
9366 
9367     status = uvm_tracker_wait_deinit(&tracker);
9368     if (status != NV_OK)
9369         UVM_ASSERT(status == uvm_global_get_status());
9370 }
9371 
9372 void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
9373 {
9374     NV_STATUS status;
9375     uvm_va_range_t *va_range = va_block->va_range;
9376     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9377     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9378     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9379 
9380     uvm_assert_mutex_locked(&va_block->lock);
9381     UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
9382 
9383     // If the GPU doesn't have GPU state then nothing could be mapped.
9384     if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
9385         return;
9386 
9387     // In UVM-Lite mode, mappings to the preferred location are not tracked
9388     // directly, so just unmap the whole block.
9389     status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
9390     if (status != NV_OK) {
9391         // Unmapping the whole block should not cause page splits so any failure
9392         // should be the result of a system-fatal error.
9393         UVM_ASSERT_MSG(status == uvm_global_get_status(),
9394                        "Unmapping failed: %s, GPU %s\n",
9395                        nvstatusToString(status), uvm_gpu_name(gpu));
9396     }
9397 
9398     status = uvm_tracker_wait(&va_block->tracker);
9399     if (status != NV_OK) {
9400         UVM_ASSERT_MSG(status == uvm_global_get_status(),
9401                        "Unmapping failed: %s, GPU %s\n",
9402                        nvstatusToString(status), uvm_gpu_name(gpu));
9403     }
9404 }
9405 
9406 // Evict pages from the GPU by moving each resident region to the CPU
9407 //
9408 // Note that the caller needs to support allocation-retry, as
9409 // uvm_va_block_migrate_locked() requires it.
9410 static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9411 {
9412     NV_STATUS status = NV_OK;
9413     const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE);
9414     uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9415     uvm_va_block_region_t subregion;
9416     uvm_service_block_context_t *service_context;
9417 
9418     service_context = uvm_service_block_context_alloc(mm);
9419     if (!service_context)
9420         return NV_ERR_NO_MEMORY;
9421 
9422     // Move all subregions resident on the GPU to the CPU
9423     for_each_va_block_subregion_in_mask(subregion, resident, region) {
9424         if (uvm_va_block_is_hmm(va_block)) {
9425             status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, gpu, service_context, resident, subregion);
9426         }
9427         else {
9428             status = uvm_va_block_migrate_locked(va_block,
9429                                                  NULL,
9430                                                  service_context,
9431                                                  subregion,
9432                                                  UVM_ID_CPU,
9433                                                  UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
9434                                                  NULL);
9435         }
9436 
9437         if (status != NV_OK)
9438             break;
9439     }
9440 
9441     if (status == NV_OK)
9442         UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id));
9443 
9444     uvm_service_block_context_free(service_context);
9445 
9446     return status;
9447 }
9448 
9449 void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9450 {
9451     NV_STATUS status;
9452     uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
9453     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9454     uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);
9455 
9456     uvm_assert_mutex_locked(&va_block->lock);
9457 
9458     if (!gpu_state)
9459         return;
9460 
9461     // The mappings should've already been torn down by GPU VA space unregister
9462     UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
9463     UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
9464     UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu));
9465 
9466     // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and
9467     // we don't rely on any state of the block across the call.
9468     // TODO: Bug 4494289: Prevent setting the global error on allocation
9469     // failures.
9470     status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));
9471     if (status != NV_OK) {
9472         UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n",
9473                       nvstatusToString(status),
9474                       uvm_gpu_name(gpu));
9475         uvm_global_set_fatal_error(status);
9476     }
9477 
9478     // This function will copy the block's tracker into each chunk then free the
9479     // chunk to PMM. If we do this before waiting for the block tracker below
9480     // we'll populate PMM's free chunks with tracker entries, which gives us
9481     // better testing coverage of chunk synchronization on GPU unregister.
9482     block_destroy_gpu_state(va_block, va_block_context, gpu->id);
9483 
9484     // Any time a GPU is unregistered we need to make sure that there are no
9485     // pending (direct or indirect) tracker entries for that GPU left in the
9486     // block's tracker. The only way to ensure that is to wait for the whole
9487     // tracker.
9488     status = uvm_tracker_wait(&va_block->tracker);
9489     if (status != NV_OK)
9490         UVM_ASSERT(status == uvm_global_get_status());
9491 }
9492 
9493 void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9494 {
9495     // Take the lock internally to not expose the caller to allocation-retry.
9496     uvm_mutex_lock(&va_block->lock);
9497 
9498     uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
9499 
9500     uvm_mutex_unlock(&va_block->lock);
9501 }
9502 
9503 static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
9504 {
9505     uvm_page_index_t page_index;
9506     uvm_page_mask_t *resident_mask;
9507 
9508     uvm_assert_mutex_locked(&va_block->lock);
9509     resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
9510     for_each_va_block_page_in_region_mask(page_index, resident_mask, region) {
9511         int nid = block_get_page_node_residency(va_block, page_index);
9512         UVM_ASSERT(nid != NUMA_NO_NODE);
9513         block_mark_cpu_page_dirty(va_block, page_index, nid);
9514     }
9515 }
9516 
9517 // Tears down everything within the block, but doesn't free the block itself.
9518 // Note that when uvm_va_block_kill is called, this is called twice: once for
9519 // the initial kill itself, then again when the block's ref count eventually
9520 // drops to zero and the block is destroyed. block->va_range is used to track
9521 // whether the block has already been killed.
9522 static void block_kill(uvm_va_block_t *block)
9523 {
9524     uvm_va_space_t *va_space;
9525     uvm_perf_event_data_t event_data;
9526     uvm_cpu_chunk_t *chunk;
9527     uvm_gpu_id_t id;
9528     NV_STATUS status;
9529     uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
9530     uvm_page_index_t page_index;
9531     uvm_page_index_t next_page_index;
9532     int nid;
9533     uvm_va_block_context_t *block_context;
9534 
9535     if (uvm_va_block_is_dead(block))
9536         return;
9537 
9538     va_space = uvm_va_block_get_va_space(block);
9539     event_data.block_destroy.block = block;
9540     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
9541 
9542     block_context = uvm_va_space_block_context(va_space, NULL);
9543 
9544     // Unmap all processors in parallel first. Unmapping the whole block won't
9545     // cause a page table split, so this should only fail if we have a system-
9546     // fatal error.
9547     if (!uvm_processor_mask_empty(&block->mapped)) {
9548         // HMM CPU mappings are controlled by Linux so no need to unmap.
9549         // Remote GPU mappings will be removed below.
9550         if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
9551             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
9552             uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
9553             uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
9554         }
9555 
9556         // We could only be killed with mapped GPU state by VA range free or VA
9557         // space teardown, so it's safe to use the va_space's block_context
9558         // because both of those have the VA space lock held in write mode.
9559         status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
9560         UVM_ASSERT(status == uvm_global_get_status());
9561     }
9562 
9563     UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
9564 
9565     // Free the GPU page tables and chunks
9566     for_each_gpu_id(id)
9567         block_destroy_gpu_state(block, block_context, id);
9568 
9569     // Wait for the GPU PTE unmaps before freeing CPU memory
9570     uvm_tracker_wait_deinit(&block->tracker);
9571 
9572     // No processor should have the CPU mapped at this point
9573     UVM_ASSERT(block_check_processor_not_mapped(block, block_context, UVM_ID_CPU));
9574 
9575     // Free CPU pages
9576     for_each_possible_uvm_node(nid) {
9577         uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9578         size_t index = node_to_index(nid);
9579 
9580         for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
9581             // Tell the OS we wrote to the page because we sometimes clear the
9582             // dirty bit after writing to it, so be conservative here. HMM dirty
9583             // flags are managed by the kernel.
9584             if (!uvm_va_block_is_hmm(block))
9585                 uvm_cpu_chunk_mark_dirty(chunk, 0);
9586 
9587             uvm_cpu_chunk_remove_from_block(block, nid, page_index);
9588             uvm_cpu_chunk_free(chunk);
9589         }
9590 
9591         UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
9592         UVM_ASSERT(node_state->chunks == 0);
9593         kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
9594     }
9595 
9596     uvm_kvfree((void *)block->cpu.node_state);
9597     block->cpu.node_state = NULL;
9598 
9599     // Clearing the resident bit isn't strictly necessary since this block
9600     // is getting destroyed, but it keeps state consistent for assertions.
9601     uvm_page_mask_zero(&block->cpu.resident);
9602     block_clear_resident_processor(block, UVM_ID_CPU);
9603 
9604     if (uvm_va_block_is_hmm(block))
9605         uvm_va_policy_clear(block, block->start, block->end);
9606 
9607     block->va_range = NULL;
9608 #if UVM_IS_CONFIG_HMM()
9609     block->hmm.va_space = NULL;
9610 #endif
9611 }
9612 
9613 // Called when the block's ref count drops to 0
9614 void uvm_va_block_destroy(nv_kref_t *nv_kref)
9615 {
9616     uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref);
9617 
9618     // Nobody else should have a reference when freeing
9619     uvm_assert_mutex_unlocked(&block->lock);
9620 
9621     uvm_mutex_lock(&block->lock);
9622     block_kill(block);
9623     uvm_mutex_unlock(&block->lock);
9624     uvm_va_block_free(block);
9625 }
9626 
9627 void uvm_va_block_kill(uvm_va_block_t *va_block)
9628 {
9629     uvm_mutex_lock(&va_block->lock);
9630     block_kill(va_block);
9631     uvm_mutex_unlock(&va_block->lock);
9632 
9633     // May call block_kill again
9634     uvm_va_block_release(va_block);
9635 }
9636 
9637 static void block_gpu_release_region(uvm_va_block_t *va_block,
9638                                      uvm_gpu_id_t gpu_id,
9639                                      uvm_va_block_gpu_state_t *gpu_state,
9640                                      uvm_page_mask_t *page_mask,
9641                                      uvm_va_block_region_t region)
9642 {
9643     uvm_page_index_t page_index;
9644 
9645     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
9646         uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index];
9647 
9648         if (!gpu_chunk)
9649             continue;
9650 
9651         // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
9652 
9653         uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
9654 
9655         // The GPU chunk will be freed when the device private reference drops.
9656         if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) &&
9657             uvm_page_mask_empty(&gpu_state->resident))
9658             block_clear_resident_processor(va_block, gpu_id);
9659 
9660         gpu_state->chunks[page_index] = NULL;
9661     }
9662 }
9663 
9664 void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
9665                                 uvm_va_block_region_t region)
9666 {
9667     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9668     uvm_perf_event_data_t event_data;
9669     uvm_gpu_id_t gpu_id;
9670 
9671     UVM_ASSERT(uvm_va_block_is_hmm(va_block));
9672     uvm_assert_mutex_locked(&va_block->lock);
9673 
9674     // Reset thrashing state for the region.
9675     event_data.block_munmap.block = va_block;
9676     event_data.block_munmap.region = region;
9677     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
9678 
9679     // Release any remaining vidmem chunks in the given region.
9680     for_each_gpu_id(gpu_id) {
9681         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
9682 
9683         if (!gpu_state)
9684             continue;
9685 
9686         uvm_page_mask_region_clear(&gpu_state->evicted, region);
9687         if (uvm_page_mask_empty(&gpu_state->evicted))
9688             uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
9689 
9690         if (gpu_state->chunks) {
9691             block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
9692 
9693             // TODO: bug 3660922: Need to update the read duplicated pages mask
9694             // when read duplication is supported for HMM.
9695         }
9696         else {
9697             UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id));
9698         }
9699     }
9700 
9701     uvm_va_policy_clear(va_block,
9702                         uvm_va_block_region_start(va_block, region),
9703                         uvm_va_block_region_end(va_block, region));
9704 }
9705 
9706 static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9707 {
9708     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9709     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9710     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9711     NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
9712     NvU32 alloc_sizes;
9713     DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9714     uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9715     size_t big_page_index;
9716     uvm_push_t push;
9717     NV_STATUS status;
9718 
9719     // We only have to split to big PTEs if we're currently a 2M PTE
9720     if (existing_gpu_state->pte_is_2m) {
9721         // We can skip the split if the 2M PTE is invalid and we have no lower
9722         // PTEs.
9723         if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE &&
9724             !existing_gpu_state->page_table_range_big.table &&
9725             !existing_gpu_state->page_table_range_4k.table)
9726             return NV_OK;
9727 
9728         alloc_sizes = big_page_size;
9729         bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9730 
9731         if (!IS_ALIGNED(new->start, big_page_size)) {
9732             alloc_sizes |= UVM_PAGE_SIZE_4K;
9733 
9734             big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
9735             __clear_bit(big_page_index, new_big_ptes);
9736         }
9737 
9738         status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL);
9739         if (status != NV_OK)
9740             return status;
9741 
9742         status = uvm_push_begin_acquire(gpu->channel_manager,
9743                                         UVM_CHANNEL_TYPE_MEMOPS,
9744                                         &existing->tracker,
9745                                         &push,
9746                                         "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
9747                                         existing->start, existing->end + 1,
9748                                         new->start, new->end + 1);
9749         if (status != NV_OK)
9750             return status;
9751 
9752         block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push);
9753     }
9754     else {
9755         big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
9756 
9757         // If the split point is on a big page boundary, or if the split point
9758         // is not currently covered by a big PTE, we don't have to split
9759         // anything.
9760         if (IS_ALIGNED(new->start, big_page_size) ||
9761             big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK ||
9762             !test_bit(big_page_index, existing_gpu_state->big_ptes))
9763             return NV_OK;
9764 
9765         status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL);
9766         if (status != NV_OK)
9767             return status;
9768 
9769         bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9770         __set_bit(big_page_index, new_big_ptes);
9771 
9772         status = uvm_push_begin_acquire(gpu->channel_manager,
9773                                         UVM_CHANNEL_TYPE_MEMOPS,
9774                                         &existing->tracker,
9775                                         &push,
9776                                         "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
9777                                         existing->start, existing->end + 1,
9778                                         new->start, new->end + 1);
9779         if (status != NV_OK)
9780             return status;
9781 
9782         block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push);
9783     }
9784 
9785     uvm_push_end(&push);
9786 
9787     // Adding this push to existing block tracker will cause all GPU PTE splits
9788     // to serialize on each other, but it's simpler than maintaining a separate
9789     // tracker and this path isn't performance-critical.
9790     return uvm_tracker_add_push_safe(&existing->tracker, &push);
9791 }
9792 
9793 static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new)
9794 {
9795     uvm_gpu_t *gpu;
9796     uvm_gpu_id_t id;
9797     NV_STATUS status;
9798 
9799     for_each_gpu_id(id) {
9800         if (!uvm_va_block_gpu_state_get(existing, id))
9801             continue;
9802 
9803         gpu = block_get_gpu(existing, id);
9804 
9805         if (block_gpu_has_page_tables(existing, gpu)) {
9806             status = block_split_presplit_ptes_gpu(existing, new, gpu);
9807             if (status != NV_OK)
9808                 return status;
9809         }
9810     }
9811 
9812     return NV_OK;
9813 }
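
// Illustrative sketch (guarded by "#if 0", hypothetical parameters): for the
// non-2M case handled in block_split_presplit_ptes_gpu() above, the pre-split
// decision boils down to "a big PTE only needs splitting when the new block's
// start address is not big-page aligned and the big PTE covering the split
// point is currently in use". The same predicate in isolation:
#if 0
static bool example_split_needs_big_pte_split(NvU64 split_addr,
                                              NvU32 big_page_size,
                                              bool covering_big_pte_active)
{
    // A split on a big-page boundary never cuts through a big PTE
    if (IS_ALIGNED(split_addr, big_page_size))
        return false;

    return covering_big_pte_active;
}
#endif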
9814 
9815 typedef struct
9816 {
9817     // Number of chunks contained by this VA block
9818     size_t num_chunks;
9819 
9820     // Index of the "interesting" chunk, either adjacent to or spanning the
9821     // split point depending on which block this is.
9822     size_t chunk_index;
9823 
9824     // Size of the chunk referenced by chunk_index
9825     uvm_chunk_size_t chunk_size;
9826 } block_gpu_chunk_split_state_t;
9827 
9828 static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
9829                                             block_gpu_chunk_split_state_t *state,
9830                                             NvU64 start,
9831                                             NvU64 end,
9832                                             uvm_page_index_t page_index,
9833                                             uvm_gpu_t *gpu)
9834 {
9835     NvU64 size = end - start + 1;
9836     state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu);
9837     state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
9838 }
9839 
9840 static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
9841 {
9842     uvm_gpu_t *accessing_gpu;
9843     uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9844 
9845     uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
9846 
9847     for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9848         NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
9849 
9850         uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
9851                                                          peer_addr,
9852                                                          uvm_gpu_chunk_get_size(chunk));
9853     }
9854 }
9855 
9856 // Perform any chunk splitting and array growing required for this block split,
9857 // but don't actually move chunk pointers anywhere.
9858 static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9859 {
9860     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9861     uvm_gpu_t *accessing_gpu;
9862     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9863     uvm_gpu_chunk_t **temp_chunks;
9864     uvm_gpu_chunk_t *original_chunk, *curr_chunk;
9865     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9866     uvm_chunk_sizes_mask_t split_sizes;
9867     uvm_chunk_size_t subchunk_size;
9868     NV_STATUS status;
9869     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
9870 
9871     block_gpu_chunk_get_split_state(existing,
9872                                     &existing_before_state,
9873                                     existing->start,
9874                                     existing->end,
9875                                     split_page_index,
9876                                     gpu);
9877     block_gpu_chunk_get_split_state(existing,
9878                                     &existing_after_state,
9879                                     existing->start,
9880                                     new->start - 1,
9881                                     split_page_index - 1,
9882                                     gpu);
9883     block_gpu_chunk_get_split_state(new,
9884                                     &new_state,
9885                                     new->start,
9886                                     new->end,
9887                                     0,
9888                                     gpu);
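    // The three states above describe, respectively, the GPU chunk layout of
    // the whole existing block before the split, of existing once truncated at
    // the split point, and of the new block. Comparing "before" and "after"
    // tells us whether the chunks array must grow, and before.chunk_index
    // identifies the chunk straddling the split point.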
9889 
9890     // Even though we're splitting existing, we could wind up requiring a larger
9891     // chunks array if we split a large chunk into many smaller ones.
9892     if (existing_after_state.num_chunks > existing_before_state.num_chunks) {
9893         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
9894                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
9895         if (!temp_chunks)
9896             return NV_ERR_NO_MEMORY;
9897         existing_gpu_state->chunks = temp_chunks;
9898     }
9899 
9900     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
9901 
9902     // If the chunk covering the split point is not populated, we're done. We've
9903     // already grown the array to cover any new chunks which may be populated
9904     // later.
9905     if (!original_chunk)
9906         return NV_OK;
9907 
9908     // Figure out the splits we need to perform. Remove all sizes >= the current
9909     // size, and all sizes < the target size. Note that the resulting mask will
9910     // be 0 if the sizes match (we're already splitting at a chunk boundary).
9911     UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size);
9912     UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size);
9913     split_sizes = gpu->parent->mmu_user_chunk_sizes;
9914     split_sizes &= existing_before_state.chunk_size - 1;
9915     split_sizes &= ~(new_state.chunk_size - 1);
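    // For example (illustrative sizes, assuming mmu_user_chunk_sizes contains
    // 4K, 64K and 2M): splitting a 2M chunk down to a 4K boundary leaves 64K
    // and 4K in split_sizes, so the loop below first splits the 2M chunk into
    // 64K subchunks, then splits the 64K subchunk covering the split point
    // into 4K subchunks.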
9916 
9917     // Keep splitting the chunk covering the split point until we hit the target
9918     // size.
9919     curr_chunk = original_chunk;
9920     for_each_chunk_size_rev(subchunk_size, split_sizes) {
9921         size_t last_index, num_subchunks;
9922 
9923         status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL);
9924         if (status != NV_OK)
9925             goto error;
9926 
9927         // Split physical GPU mappings for indirect peers
9928         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9929             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
9930 
9931             status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
9932                                                                       peer_addr,
9933                                                                       subchunk_size);
9934             if (status != NV_OK)
9935                 goto error;
9936         }
9937 
9938         if (subchunk_size == new_state.chunk_size)
9939             break;
9940 
9941         // Compute the last subchunk index prior to the split point. Divide the
9942         // entire address space into units of subchunk_size, then mod by the
9943         // number of subchunks within the parent.
9944         last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size);
9945         num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size);
9946         UVM_ASSERT(num_subchunks > 1);
9947         last_index &= num_subchunks - 1;
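        // For example (illustrative addresses): with a naturally-aligned 2M
        // parent chunk backing VA [0x200000, 0x400000), subchunk_size = 64K
        // and new->start = 0x290000: (new->start - 1) / 64K = 40 and
        // num_subchunks = 32, so last_index = 40 & 31 = 8, the subchunk
        // covering [0x280000, 0x290000) just below the split point.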
9948 
9949         uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk);
9950         UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size);
9951     }
9952 
9953     // Note that existing's chunks array still has a pointer to original_chunk,
9954     // not to any newly-split subchunks. If a subsequent split failure occurs on
9955     // a later GPU we'll have to merge it back. Once we're past the preallocate
9956     // stage we'll remove it from the chunks array and move the new split chunks
9957     // in.
9958 
9959     return NV_OK;
9960 
9961 error:
9962     // On error we need to leave the chunk in its initial state
9963     block_merge_chunk(existing, gpu, original_chunk);
9964 
9965     return status;
9966 }
9967 
9968 static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block, int nid)
9969 {
9970     uvm_cpu_chunk_storage_mixed_t *mixed;
9971     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, 0);
9972     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9973     NV_STATUS status;
9974 
9975     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
9976     UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK);
9977 
9978     mixed = uvm_kvmalloc_zero(sizeof(*mixed));
9979     if (!mixed)
9980         return NV_ERR_NO_MEMORY;
9981 
9982     status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots);
9983     if (status != NV_OK) {
9984         uvm_kvfree(mixed);
9985         return status;
9986     }
9987 
9988     bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
9989     node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
9990 
9991     return status;
9992 }
9993 
9994 static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
9995 {
9996     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9997     uvm_cpu_chunk_storage_mixed_t *mixed;
9998     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
9999     uvm_cpu_chunk_t **small_chunks;
10000     size_t slot_index;
10001     NV_STATUS status;
10002 
10003     UVM_ASSERT(chunk);
10004     UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
10005     UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10006 
10007     mixed = uvm_cpu_storage_get_ptr(node_state);
10008     slot_index = compute_slot_index(block, page_index);
10009     small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
10010     if (!small_chunks)
10011         return NV_ERR_NO_MEMORY;
10012 
10013     status = uvm_cpu_chunk_split(chunk, small_chunks);
10014     if (status != NV_OK) {
10015         uvm_kvfree(small_chunks);
10016         return status;
10017     }
10018 
10019     mixed->slots[slot_index] = small_chunks;
10020     clear_bit(slot_index, mixed->big_chunks);
10021 
10022     return status;
10023 }
10024 
10025 static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10026 {
10027     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10028     uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
10029     uvm_chunk_size_t new_size;
10030     uvm_gpu_t *gpu;
10031     NvU64 gpu_mapping_addr;
10032     uvm_processor_mask_t *gpu_split_mask;
10033     uvm_gpu_id_t id;
10034     NV_STATUS status;
10035 
10036     gpu_split_mask = uvm_processor_mask_cache_alloc();
10037     if (!gpu_split_mask)
10038         return NV_ERR_NO_MEMORY;
10039 
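    // CPU chunks are split one level at a time: 2M chunks split into 64K
    // chunks, and 64K chunks split into 4K chunks. The caller invokes this
    // function once per level until the split point is chunk-aligned.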
10040     if (chunk_size == UVM_CHUNK_SIZE_2M)
10041         new_size = UVM_CHUNK_SIZE_64K;
10042     else
10043         new_size = UVM_CHUNK_SIZE_4K;
10044 
10045     UVM_ASSERT(IS_ALIGNED(chunk_size, new_size));
10046 
10047     uvm_processor_mask_zero(gpu_split_mask);
10048     for_each_gpu_id(id) {
10049         if (!uvm_va_block_gpu_state_get(block, id))
10050             continue;
10051 
10052         gpu = block_get_gpu(block, id);
10053 
10054         // If the parent chunk has not been mapped, there is nothing to split.
10055         gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10056         if (gpu_mapping_addr == 0)
10057             continue;
10058 
10059         status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10060                                                             gpu_mapping_addr,
10061                                                             new_size);
10062         if (status != NV_OK)
10063             goto merge;
10064 
10065         uvm_processor_mask_set(gpu_split_mask, id);
10066     }
10067 
10068     if (new_size == UVM_CHUNK_SIZE_64K)
10069         status = block_split_cpu_chunk_to_64k(block, nid);
10070     else
10071         status = block_split_cpu_chunk_to_4k(block, page_index, nid);
10072 
10073     if (status != NV_OK) {
10074 merge:
10075         for_each_gpu_id_in_mask(id, gpu_split_mask) {
10076             gpu = block_get_gpu(block, id);
10077             gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10078             uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10079                                                        gpu_mapping_addr,
10080                                                        chunk_size);
10081         }
10082     }
10083 
10084     uvm_processor_mask_cache_free(gpu_split_mask);
10085 
10086     return status;
10087 }
10088 
10089 static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new, int nid)
10090 {
10091     uvm_cpu_chunk_storage_mixed_t *existing_mixed;
10092     uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
10093     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
10094     uvm_va_block_cpu_node_state_t *new_node_state = block_node_state_get(new, nid);
10095     size_t slot_offset;
10096     size_t existing_slot;
10097     NV_STATUS status = NV_OK;
10098 
10099     UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10100     existing_mixed = uvm_cpu_storage_get_ptr(node_state);
10101 
10102     // Pre-allocate chunk storage for the new block. By definition, the new
10103     // block will contain only 64K and/or 4K chunks.
10104     //
10105     // We do this here so there are no failures in block_split_cpu().
10106     new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed));
10107     if (!new_mixed)
10108         return NV_ERR_NO_MEMORY;
10109 
10110     slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start));
10111     existing_slot = slot_offset;
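    // The new block's slot N corresponds to existing's slot (slot_offset + N),
    // so walk existing's non-big slots from the split point onward and
    // pre-allocate a matching 4K-chunk array wherever existing already has one.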
10112     for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) {
10113         size_t new_slot = existing_slot - slot_offset;
10114 
10115         if (existing_mixed->slots[existing_slot]) {
10116             uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
10117 
10118             if (!small_chunks) {
10119                 status = NV_ERR_NO_MEMORY;
10120                 goto done;
10121             }
10122 
10123             new_mixed->slots[new_slot] = small_chunks;
10124         }
10125     }
10126 
10127     new_node_state->chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
10128     UVM_ASSERT(status == NV_OK);
10129 
10130 done:
10131     if (status != NV_OK) {
10132         for (; existing_slot > slot_offset; existing_slot--)
10133             uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]);
10134 
10135         uvm_kvfree(new_mixed);
10136     }
10137 
10138     return status;
10139 }
10140 
10141 static void block_free_cpu_chunk_storage(uvm_va_block_t *block, int nid)
10142 {
10143     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10144 
10145     if (node_state->chunks) {
10146         uvm_cpu_chunk_storage_mixed_t *mixed;
10147         size_t slot_index;
10148 
10149         UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10150         mixed = uvm_cpu_storage_get_ptr(node_state);
10151         for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
10152             uvm_kvfree(mixed->slots[slot_index]);
10153 
10154         uvm_kvfree(mixed);
10155         node_state->chunks = 0;
10156     }
10157 }
10158 
10159 // Perform any CPU chunk splitting that may be required for this block split.
10160 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block.
10161 static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
10162 {
10163     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
10164     uvm_cpu_chunk_t *splitting_chunk;
10165     uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
10166     uvm_chunk_size_t subchunk_size;
10167     NV_STATUS status = NV_OK;
10168     int nid;
10169 
10170     UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
10171 
10172     for_each_possible_uvm_node(nid) {
10173         splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10174 
10175         // If the page covering the split point has not been populated, there is no
10176         // need to split.
10177         if (!splitting_chunk)
10178             continue;
10179 
10180         // If the split point is aligned on the chunk size, there is no need to
10181         // split.
10182         if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
10183             continue;
10184 
10185         // Remove all sizes above the chunk's current size.
10186         split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
10187         // Remove all sizes below the alignment of the new block's start.
10188         split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
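        // For example (illustrative, assuming allocation sizes of 4K, 64K and
        // 2M): when splitting a 2M chunk with a 64K-aligned new->start, only
        // 64K remains in split_sizes and a single split suffices. With a
        // 4K-aligned split point, both 64K and 4K remain and the chunk is
        // split twice.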
10189 
10190         for_each_chunk_size_rev(subchunk_size, split_sizes) {
10191             status = block_split_cpu_chunk_one(existing, page_index, nid);
10192             if (status != NV_OK)
10193                 return status;
10194         }
10195 
10196         status = block_prealloc_cpu_chunk_storage(existing, new, nid);
10197         if (status != NV_OK)
10198             break;
10199     }
10200 
10201     return status;
10202 }
10203 
10204 static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10205 {
10206     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10207     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
10208     size_t slot_index = compute_slot_index(block, page_index);
10209     uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
10210     uvm_cpu_chunk_t *merged_chunk;
10211 
10212     UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10213     UVM_ASSERT(small_chunks);
10214     UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
10215 
10216     merged_chunk = uvm_cpu_chunk_merge(small_chunks);
10217     mixed->slots[slot_index] = merged_chunk;
10218     set_bit(slot_index, mixed->big_chunks);
10219     uvm_kvfree(small_chunks);
10220 }
10221 
10222 static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10223 {
10224     uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10225     uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
10226     uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
10227     uvm_cpu_chunk_t *merged_chunk;
10228 
10229     UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10230     UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
10231 
10232     merged_chunk = uvm_cpu_chunk_merge(big_chunks);
10233     node_state->chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
10234     uvm_kvfree(mixed);
10235 }
10236 
10237 static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10238 {
10239     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10240     uvm_gpu_id_t id;
10241 
10242     if (!chunk)
10243         return;
10244 
10245     if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
10246         block_merge_cpu_chunks_to_64k(block, page_index, nid);
10247     }
10248     else {
10249         UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
10250         block_merge_cpu_chunks_to_2m(block, page_index, nid);
10251     }
10252 
10253     chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10254 
10255     for_each_gpu_id(id) {
10256         NvU64 gpu_mapping_addr;
10257         uvm_gpu_t *gpu;
10258 
10259         if (!uvm_va_block_gpu_state_get(block, id))
10260             continue;
10261 
10262         gpu = block_get_gpu(block, id);
10263         gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10264         if (gpu_mapping_addr == 0)
10265             continue;
10266 
10267         uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10268                                                    gpu_mapping_addr,
10269                                                    uvm_cpu_chunk_get_size(chunk));
10270     }
10271 }
10272 
10273 static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
10274 {
10275     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
10276     uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
10277     uvm_chunk_size_t largest_size;
10278     size_t block_size = uvm_va_block_size(existing);
10279     int nid;
10280 
10281     // Since block sizes are not always powers of 2, use the largest power of 2
10282     // less than or equal to the block size; we can't merge to a size larger
10283     // than the block itself.
10284     largest_size = rounddown_pow_of_two(block_size);
10285 
10286     for_each_possible_uvm_node(nid) {
10287         uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10288         uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
10289         uvm_chunk_size_t chunk_size;
10290         uvm_chunk_size_t merge_size;
10291 
10292         if (!chunk || uvm_cpu_chunk_is_physical(chunk))
10293             continue;
10294 
10295         chunk_size = uvm_cpu_chunk_get_size(chunk);
10296 
10297         // Remove all CPU chunk sizes above the size of the existing VA block.
10298         merge_sizes &= (largest_size | (largest_size - 1));
10299 
10300         // Remove all CPU chunk sizes at or below the current size of the chunk being merged up.
10301         merge_sizes &= ~(chunk_size | (chunk_size - 1));
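        // For example (illustrative, assuming allocation sizes of 4K, 64K and
        // 2M): with a post-split block of 1.5M, largest_size is 1M, so 2M is
        // removed from merge_sizes. If the chunk at the split point is
        // currently 4K, sizes at or below 4K are removed too, leaving 64K as
        // the only merge target.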
10302 
10303         for_each_chunk_size(merge_size, merge_sizes) {
10304             uvm_va_block_region_t chunk_region;
10305 
10306             // The block has to fully contain the VA range after the merge.
10307             if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
10308                 !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
10309                 break;
10310 
10311             chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
10312 
10313             // If not all pages in the region covered by the chunk are allocated,
10314             // we can't merge.
10315             if (!uvm_page_mask_region_full(&node_state->allocated, chunk_region))
10316                 break;
10317 
10318             block_merge_cpu_chunks_one(existing, chunk_region.first, nid);
10319             chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10320             if (uvm_cpu_chunk_is_physical(chunk))
10321                 break;
10322         }
10323 
10324         block_free_cpu_chunk_storage(new, nid);
10325     }
10326 }
10327 
10328 // Pre-allocate everything which doesn't require retry on both existing and new
10329 // which will be needed to handle a split. If this fails, existing must remain
10330 // functionally unmodified.
10331 static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new)
10332 {
10333     NV_STATUS status;
10334     uvm_gpu_t *gpu;
10335     uvm_gpu_id_t id;
10336     uvm_page_index_t split_page_index;
10337     uvm_va_block_test_t *block_test;
10338 
10339     status = block_presplit_cpu_chunks(existing, new);
10340     if (status != NV_OK)
10341         goto error;
10342 
10343     for_each_gpu_id(id) {
10344         if (!uvm_va_block_gpu_state_get(existing, id))
10345             continue;
10346 
10347         gpu = block_get_gpu(existing, id);
10348 
10349         status = block_presplit_gpu_chunks(existing, new, gpu);
10350         if (status != NV_OK)
10351             goto error;
10352 
10353         if (!block_gpu_state_get_alloc(new, gpu)) {
10354             status = NV_ERR_NO_MEMORY;
10355             goto error;
10356         }
10357     }
10358 
10359     block_test = uvm_va_block_get_test(existing);
10360     if (block_test && block_test->inject_split_error) {
10361         block_test->inject_split_error = false;
10362         if (!uvm_va_block_is_hmm(existing)) {
10363             UVM_ASSERT(existing->va_range->inject_split_error);
10364             existing->va_range->inject_split_error = false;
10365         }
10366         status = NV_ERR_NO_MEMORY;
10367         goto error;
10368     }
10369 
10370     if (uvm_va_block_is_hmm(existing)) {
10371         uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start);
10372 
10373         if (node && node->node.start != new->start) {
10374             status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL);
10375             if (status != NV_OK)
10376                 goto error;
10377         }
10378     }
10379 
10380     return NV_OK;
10381 
10382 error:
10383     // Merge back the chunks we split
10384     split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10385 
10386     for_each_gpu_id(id) {
10387         uvm_gpu_chunk_t *chunk;
10388         size_t chunk_index;
10389         uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id);
10390 
10391         if (!existing_gpu_state)
10392             continue;
10393 
10394         // If the chunk spanning the split point was split, merge it back
10395         gpu = block_get_gpu(existing, id);
10396         chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL);
10397         chunk = existing_gpu_state->chunks[chunk_index];
10398         if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
10399             continue;
10400 
10401         block_merge_chunk(existing, gpu, chunk);
10402 
10403         // We could attempt to shrink the chunks array back down, but it doesn't
10404         // hurt much to have it larger than necessary, and we'd have to handle
10405         // the shrink call failing anyway on this error path.
10406 
10407     }
10408 
10409     block_merge_cpu_chunks(existing, new);
10410 
10411     return status;
10412 }
10413 
10414 // Re-calculate the block's top-level processor masks:
10415 //   - block->mapped
10416 //   - block->resident
10417 //
10418 // This is called on block split.
10419 static void block_set_processor_masks(uvm_va_block_t *block)
10420 {
10421     size_t num_pages = uvm_va_block_num_cpu_pages(block);
10422     uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages);
10423     uvm_gpu_id_t id;
10424 
10425     if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) {
10426         UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region));
10427         uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
10428     }
10429     else {
10430         uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
10431     }
10432 
10433     if (uvm_page_mask_region_empty(uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE), block_region)) {
10434         uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
10435 
10436         if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
10437             UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU));
10438 
10439         block_clear_resident_processor(block, UVM_ID_CPU);
10440     }
10441     else {
10442         block_set_resident_processor(block, UVM_ID_CPU);
10443     }
10444 
10445     for_each_gpu_id(id) {
10446         uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
10447         if (!gpu_state)
10448             continue;
10449 
10450         if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) {
10451             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region));
10452             UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region));
10453             uvm_processor_mask_clear(&block->mapped, id);
10454         }
10455         else {
10456             uvm_processor_mask_set(&block->mapped, id);
10457         }
10458 
10459         if (uvm_page_mask_region_empty(&gpu_state->resident, block_region))
10460             block_clear_resident_processor(block, id);
10461         else
10462             block_set_resident_processor(block, id);
10463 
10464         if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region))
10465             uvm_processor_mask_clear(&block->evicted_gpus, id);
10466         else
10467             uvm_processor_mask_set(&block->evicted_gpus, id);
10468     }
10469 }
10470 
10471 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts
10472 // corresponding to a block split.
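// For example (illustrative page counts): if existing originally covered 512
// pages and keeps 384 of them after the split, new_mask becomes existing_mask
// shifted right by 384, so new's page 0 maps to old page 384, and pages
// [384, 512) are then cleared from existing_mask.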
10473 static void block_split_page_mask(uvm_page_mask_t *existing_mask,
10474                                   size_t existing_pages,
10475                                   uvm_page_mask_t *new_mask,
10476                                   size_t new_pages)
10477 {
10478     UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n",
10479                    existing_pages, new_pages);
10480 
10481     // The new block is always in the upper region of existing, so shift the bit
10482     // vectors down.
10483     //
10484     // Note that bitmap_shift_right requires both dst and src to be the same
10485     // size. That's ok since we don't scale them by block size.
10486     uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages);
10487     uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages));
10488 }
10489 
10490 // Split the CPU state within the existing block. existing's start is correct
10491 // but its end has not yet been adjusted.
10492 static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
10493 {
10494     size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new);
10495     uvm_pte_bits_cpu_t pte_bit;
10496     uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing);
10497     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10498     uvm_page_index_t page_index;
10499     uvm_page_index_t next_page_index;
10500     uvm_cpu_chunk_t *chunk;
10501     uvm_va_range_t *existing_va_range = existing->va_range;
10502     int nid;
10503 
10504     if (existing_va_range) {
10505         UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10506         UVM_ASSERT(existing->va_range->type == new->va_range->type);
10507     }
10508 
10509     UVM_ASSERT(existing->start < new->start);
10510     UVM_ASSERT(existing->end == new->end);
10511 
10512     UVM_ASSERT(PAGE_ALIGNED(new->start));
10513     UVM_ASSERT(PAGE_ALIGNED(existing->start));
10514 
10515     existing_pages = (new->start - existing->start) / PAGE_SIZE;
10516 
10517     // We don't have to unmap the CPU since its virtual -> physical mappings
10518     // don't change.
10519 
10520     for_each_possible_uvm_node(nid) {
10521         uvm_page_mask_t *existing_resident_mask = uvm_va_block_resident_mask_get(existing, UVM_ID_CPU, nid);
10522         uvm_page_mask_t *new_resident_mask = uvm_va_block_resident_mask_get(new, UVM_ID_CPU, nid);
10523 
10524         for_each_cpu_chunk_in_block_region_safe(chunk,
10525                                                 page_index,
10526                                                 next_page_index,
10527                                                 existing,
10528                                                 nid,
10529                                                 uvm_va_block_region(split_page_index, block_region.outer)) {
10530             uvm_page_index_t new_chunk_page_index;
10531             NV_STATUS status;
10532 
10533             uvm_cpu_chunk_remove_from_block(existing, nid, page_index);
10534 
10535             // The chunk has to be adjusted for the new block before inserting it.
10536             new_chunk_page_index = page_index - split_page_index;
10537 
10538             // This should never fail because all necessary storage was allocated
10539             // in block_presplit_cpu_chunks().
10540             status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
10541             UVM_ASSERT(status == NV_OK);
10542         }
10543 
10544         block_split_page_mask(existing_resident_mask, existing_pages, new_resident_mask, new_pages);
10545     }
10546 
10547     block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
10548     new->cpu.ever_mapped = existing->cpu.ever_mapped;
10549 
10550     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
10551         block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
10552 }
10553 
10554 // Fill out the blocks' chunks arrays with the chunks split by
10555 // block_presplit_gpu_chunks.
10556 static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
10557 {
10558     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
10559     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id);
10560     uvm_gpu_chunk_t **temp_chunks;
10561     uvm_gpu_chunk_t *original_chunk;
10562     block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
10563     size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new;
10564     uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10565     size_t i;
10566 
10567     block_gpu_chunk_get_split_state(existing,
10568                                     &existing_before_state,
10569                                     existing->start,
10570                                     existing->end,
10571                                     split_page_index,
10572                                     gpu);
10573     block_gpu_chunk_get_split_state(existing,
10574                                     &existing_after_state,
10575                                     existing->start,
10576                                     new->start - 1,
10577                                     split_page_index - 1,
10578                                     gpu);
10579     block_gpu_chunk_get_split_state(new,
10580                                     &new_state,
10581                                     new->start,
10582                                     new->end,
10583                                     0,
10584                                     gpu);
10585 
10586     // General case (B is original_chunk):
10587     //                                          split
10588     //                                            v
10589     //  existing (before) [------ A -----][------ B -----][------ C -----]
10590     //  existing (after)  [------ A -----][- B0 -]
10591     //  new                                       [- B1 -][------ C -----]
10592     //
10593     // Note that the logic below also handles the case of the split happening at
10594     // a chunk boundary. That case behaves as though there is no B0 chunk.
10595 
10596     // Number of chunks to the left and right of original_chunk (A and C above).
10597     // Either or both of these may be 0.
10598     num_pre_chunks  = existing_before_state.chunk_index;
10599     num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1;
10600 
10601     // Number of subchunks under existing's portion of original_chunk (B0 above)
10602     num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks;
10603 
10604     // Number of subchunks under new's portion of original_chunk (B1 above)
10605     num_split_chunks_new = new_state.num_chunks - num_post_chunks;
10606 
10607     UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0);
10608     UVM_ASSERT(num_split_chunks_new > 0);
10609 
10610     // Copy post chunks from the end of existing into new (C above)
10611     memcpy(&new_gpu_state->chunks[num_split_chunks_new],
10612            &existing_gpu_state->chunks[existing_before_state.chunk_index + 1],
10613            num_post_chunks * sizeof(new_gpu_state->chunks[0]));
10614 
10615     // Save off the original split chunk since we may overwrite the array
10616     original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
10617 
10618     // Fill out the new pointers
10619     if (original_chunk) {
10620         // Note that if the split happened at a chunk boundary, original_chunk
10621         // will not be split. In that case, num_split_chunks_existing will be 0
10622         // and num_split_chunks_new will be 1, so the left copy will be skipped
10623         // and the right copy will pick up the chunk.
10624 
10625         // Copy left newly-split chunks into existing (B0 above). The array was
10626         // re-sized in block_presplit_gpu_chunks as necessary.
10627         size_t num_subchunks;
10628 
10629         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
10630                                                   original_chunk,
10631                                                   0, // start_index
10632                                                   num_split_chunks_existing,
10633                                                   &existing_gpu_state->chunks[existing_before_state.chunk_index]);
10634         UVM_ASSERT(num_subchunks == num_split_chunks_existing);
10635 
10636         // Copy right newly-split chunks into new (B1 above), overwriting the
10637         // pointer to the original chunk.
10638         num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
10639                                                   original_chunk,
10640                                                   num_split_chunks_existing, // start_index
10641                                                   num_split_chunks_new,
10642                                                   &new_gpu_state->chunks[0]);
10643         UVM_ASSERT(num_subchunks == num_split_chunks_new);
10644     }
10645     else {
10646         // If the chunk wasn't already populated we don't need to copy pointers
10647         // anywhere, but we need to clear out stale pointers from existing's
10648         // array covering the new elements. new's chunks array was already zero-
10649         // initialized.
10650         memset(&existing_gpu_state->chunks[existing_before_state.chunk_index],
10651                0,
10652                num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0]));
10653     }
10654 
10655     // Since we update the reverse map information, protect it against a
10656     // concurrent lookup
10657     uvm_spin_lock(&gpu->pmm.list_lock);
10658 
10659     // Update the reverse map of all the chunks that are now under the new block
10660     for (i = 0; i < new_state.num_chunks; ++i) {
10661         if (new_gpu_state->chunks[i]) {
10662             UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing);
10663             new_gpu_state->chunks[i]->va_block = new;
10664 
10665             // Adjust the page_index within the VA block for the new subchunks in
10666             // the new VA block
10667             UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index);
10668             new_gpu_state->chunks[i]->va_block_page_index -= split_page_index;
10669         }
10670     }
10671 
10672     uvm_spin_unlock(&gpu->pmm.list_lock);
10673 
10674     // Attempt to shrink existing's chunk allocation. If the realloc fails, just
10675     // keep on using the old larger one.
10676     if (existing_after_state.num_chunks < existing_before_state.num_chunks) {
10677         temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
10678                                     existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
10679         if (temp_chunks)
10680             existing_gpu_state->chunks = temp_chunks;
10681     }
10682 }
10683 
10684 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id)
10685 {
10686     uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id);
10687     uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id);
10688     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
10689     uvm_gpu_va_space_t *gpu_va_space;
10690     uvm_gpu_t *gpu;
10691     uvm_gpu_t *accessing_gpu;
10692     size_t new_pages = uvm_va_block_num_cpu_pages(new);
10693     size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
10694     uvm_pte_bits_gpu_t pte_bit;
10695     size_t num_chunks, i;
10696     uvm_cpu_chunk_t *cpu_chunk;
10697     uvm_page_index_t page_index;
10698     int nid;
10699 
10700     if (!existing_gpu_state)
10701         return;
10702 
10703     gpu = uvm_va_space_get_gpu(va_space, gpu_id);
10704     UVM_ASSERT(new_gpu_state);
10705 
10706     new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes;
10707 
10708     UVM_ASSERT(PAGE_ALIGNED(new->start));
10709     UVM_ASSERT(PAGE_ALIGNED(existing->start));
10710     existing_pages = (new->start - existing->start) / PAGE_SIZE;
10711 
10712     for_each_possible_uvm_node(nid) {
10713         for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
10714             uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
10715                                                          uvm_cpu_chunk_get_parent_gpu_phys_addr(cpu_chunk,
10716                                                                                                 gpu->parent),
10717                                                          new);
10718         }
10719     }
10720 
10721     block_copy_split_gpu_chunks(existing, new, gpu);
10722 
10723     num_chunks = block_num_gpu_chunks(new, gpu);
10724 
10725     // Reparent GPU mappings for indirect peers
10726     for (i = 0; i < num_chunks; ++i) {
10727         uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
10728         if (!chunk)
10729             continue;
10730 
10731         for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
10732             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
10733 
10734             uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
10735                                                                peer_addr,
10736                                                                new);
10737         }
10738     }
10739 
10740     block_split_page_mask(&existing_gpu_state->resident,
10741                           existing_pages,
10742                           &new_gpu_state->resident,
10743                           new_pages);
10744 
10745     for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
10746         block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages,
10747                               &new_gpu_state->pte_bits[pte_bit], new_pages);
10748     }
10749 
10750     // Adjust page table ranges.
10751     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
10752     if (gpu_va_space) {
10753         if (existing_gpu_state->page_table_range_big.table) {
10754             NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
10755 
10756             // existing's end has not been adjusted yet
10757             existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
10758 
10759             // Take references on all big pages covered by new
10760             new_pages_big = uvm_va_block_num_big_pages(new, big_page_size);
10761             if (new_pages_big) {
10762                 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
10763                                                &existing_gpu_state->page_table_range_big,
10764                                                &new_gpu_state->page_table_range_big,
10765                                                new_pages_big);
10766 
10767                 // If the split point is within a big page region, we might have
10768                 // a gap since neither existing nor new can use it anymore.
10769                 // Get the top N bits from existing's mask to handle that.
10770                 bitmap_shift_right(new_gpu_state->big_ptes,
10771                                    existing_gpu_state->big_ptes,
10772                                    uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big,
10773                                    MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
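                // For example (illustrative counts): if existing originally
                // spanned 32 big pages and new fully covers the last 12, the
                // shift is 20, so new's big_ptes bit 0 reflects existing's
                // bit 20.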
10774 
10775                 new_gpu_state->initialized_big = existing_gpu_state->initialized_big;
10776             }
10777 
10778             // Drop existing's references on the big PTEs it no longer covers
10779             // now that new has references on them. Note that neither existing
10780             // nor new might have big PTEs after the split. In that case, this
10781             // shrink will free the entire old range.
10782             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
10783                                         &existing_gpu_state->page_table_range_big,
10784                                         existing_pages_big);
10785 
10786             if (existing_pages_big == 0) {
10787                 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big));
10788                 existing_gpu_state->initialized_big = false;
10789             }
10790 
10791             bitmap_clear(existing_gpu_state->big_ptes,
10792                          existing_pages_big,
10793                          MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big);
10794         }
10795 
10796         if (existing_gpu_state->page_table_range_4k.table) {
10797             // Since existing and new share the same PDE we just need to bump
10798             // the ref-count on new's sub-range.
10799             uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
10800                                            &existing_gpu_state->page_table_range_4k,
10801                                            &new_gpu_state->page_table_range_4k,
10802                                            uvm_va_block_size(new) / UVM_PAGE_SIZE_4K);
10803 
10804             // Drop existing's references on the PTEs it no longer covers now
10805             // that new has references on them.
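            // existing_pages counts PAGE_SIZE pages, so scale it by
            // PAGE_SIZE / 4K to get the number of 4K PTE references dropped
            // (a factor of 1 on 4K-page kernels, 16 on 64K-page kernels).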
10806             existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K);
10807             uvm_page_table_range_shrink(&gpu_va_space->page_tables,
10808                                         &existing_gpu_state->page_table_range_4k,
10809                                         existing_pages_4k);
10810         }
10811 
10812         // We have to set this explicitly to handle the case of splitting an
10813         // invalid, active 2M PTE with no lower page tables allocated.
10814         if (existing_gpu_state->pte_is_2m) {
10815             UVM_ASSERT(!existing_gpu_state->page_table_range_big.table);
10816             UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table);
10817             existing_gpu_state->pte_is_2m = false;
10818         }
10819 
10820         // existing can't possibly cover 2MB after a split, so drop any 2M PTE
10821         // references it has. We've taken the necessary references on the lower
10822         // tables above.
10823         block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m);
10824         existing_gpu_state->activated_big = false;
10825         existing_gpu_state->activated_4k = false;
10826     }
10827 
10828     block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages);
10829 }
10830 
10831 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
10832                              NvU64 new_end,
10833                              uvm_va_block_t **new_va_block,
10834                              uvm_va_range_t *new_va_range)
10835 {
10836     uvm_va_space_t *va_space;
10837     uvm_va_block_t *new_block = NULL;
10838     NV_STATUS status;
10839 
10840     va_space = new_va_range->va_space;
10841     UVM_ASSERT(existing_va_block->va_range);
10842     UVM_ASSERT(existing_va_block->va_range->va_space == va_space);
10843     UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block));
10844 
10845     // External range types can't be split
10846     UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10847     UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10848     uvm_assert_rwsem_locked_write(&va_space->lock);
10849 
10850     UVM_ASSERT(new_end > existing_va_block->start);
10851     UVM_ASSERT(new_end < existing_va_block->end);
10852     UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
10853 
10854     status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block);
10855     if (status != NV_OK)
10856         return status;
10857 
10858     // We're protected from other splits and faults by the va_space lock being
10859     // held in write mode, but that doesn't stop the reverse mapping (eviction
10860     // path) from inspecting the existing block. Stop those threads by taking
10861     // the block lock. When a reverse mapping thread takes this lock after the
10862     // split has been performed, it will have to re-inspect state and may see
10863     // that it should use the newly-split block instead.
10864     uvm_mutex_lock(&existing_va_block->lock);
10865 
10866     status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range);
10867 
10868     uvm_mutex_unlock(&existing_va_block->lock);
10869 
10870     if (status != NV_OK)
10871         uvm_va_block_release(new_block);
10872     else if (new_va_block)
10873         *new_va_block = new_block;
10874 
10875     return status;
10876 }
10877 
10878 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
10879                                     NvU64 new_end,
10880                                     uvm_va_block_t *new_block,
10881                                     uvm_va_range_t *new_va_range)
10882 {
10883     uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block);
10884     uvm_gpu_id_t id;
10885     NV_STATUS status;
10886     uvm_perf_event_data_t event_data;
10887     uvm_va_block_context_t *va_block_context;
10888 
10889     uvm_assert_rwsem_locked_write(&va_space->lock);
10890 
10891     va_block_context = uvm_va_space_block_context(va_space, NULL);
10892 
10893     UVM_ASSERT(block_check_chunks(existing_va_block));
10894 
10895     // As soon as we update existing's reverse mappings to point to the newly-
10896     // split block, the eviction path could try to operate on the new block.
10897     // Lock that out too until new is ready.
10898     //
10899     // Note that we usually shouldn't nest block locks, but it's ok here because
10900     // we just created new_block so no other thread could possibly take it out
10901     // of order with existing's lock.
10902     uvm_mutex_lock_nested(&new_block->lock);
10903 
10904     // The split has to be transactional, meaning that if we fail, the existing
10905     // block must not be modified. Handle that by pre-allocating everything we
10906     // might need under both existing and new at the start so we only have a
10907     // single point of failure.
10908 
10909     // Since pre-allocation might require allocating new PTEs, we have to handle
10910     // allocation retry which might drop existing's block lock. The
10911     // preallocation is split into two steps for that: the first part which
10912     // allocates and splits PTEs can handle having the block lock dropped then
10913     // re-taken. It won't modify existing_va_block other than adding new PTE
10914     // allocations and splitting existing PTEs, which is always safe.
10915     status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block,
10916                                        NULL,
10917                                        block_split_presplit_ptes(existing_va_block, new_block));
10918     if (status != NV_OK)
10919         goto out;
10920 
10921     // Pre-allocate, stage two. This modifies existing_va_block in ways which
10922     // violate many assumptions (such as changing chunk size), but it will put
10923     // things back into place on a failure without dropping the block lock.
10924     status = block_split_preallocate_no_retry(existing_va_block, new_block);
10925     if (status != NV_OK)
10926         goto out;
10927 
10928     // We'll potentially be freeing page tables, so we need to wait for any
10929     // outstanding work before we start
10930     status = uvm_tracker_wait(&existing_va_block->tracker);
10931     if (status != NV_OK)
10932         goto out;
10933 
10934     // Update existing's state only once we're past all failure points
10935 
10936     event_data.block_shrink.block = existing_va_block;
10937     uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data);
10938 
10939     block_split_cpu(existing_va_block, new_block);
10940 
10941     for_each_gpu_id(id)
10942         block_split_gpu(existing_va_block, new_block, id);
10943 
10944     // Update the size of the existing block first so that
10945     // block_set_processor_masks can use block_{set,clear}_resident_processor,
10946     // which rely on the size being correct.
10947     existing_va_block->end = new_end;
10948 
10949     block_split_page_mask(&existing_va_block->read_duplicated_pages,
10950                           uvm_va_block_num_cpu_pages(existing_va_block),
10951                           &new_block->read_duplicated_pages,
10952                           uvm_va_block_num_cpu_pages(new_block));
10953 
10954     if (!uvm_va_block_is_hmm(existing_va_block)) {
10955         block_split_page_mask(&existing_va_block->maybe_mapped_pages,
10956                               uvm_va_block_num_cpu_pages(existing_va_block),
10957                               &new_block->maybe_mapped_pages,
10958                               uvm_va_block_num_cpu_pages(new_block));
10959     }
10960 
10961     block_set_processor_masks(existing_va_block);
10962     block_set_processor_masks(new_block);
10963 
10964     if (uvm_va_block_is_hmm(existing_va_block)) {
10965         uvm_hmm_va_block_split_tree(existing_va_block, new_block);
10966         uvm_va_policy_node_split_move(existing_va_block, new_block);
10967     }
10968 
10969 out:
10970     // Run checks on existing_va_block even on failure, since an error must
10971     // leave the block in a consistent state.
10972     UVM_ASSERT(block_check_chunks(existing_va_block));
10973     UVM_ASSERT(block_check_mappings(existing_va_block, va_block_context));
10974     if (status == NV_OK) {
10975         UVM_ASSERT(block_check_chunks(new_block));
10976         UVM_ASSERT(block_check_mappings(new_block, va_block_context));
10977     }
10978     else {
10979         int nid;
10980 
10981         for_each_possible_uvm_node(nid)
10982             block_free_cpu_chunk_storage(new_block, nid);
10983     }
10984 
10985     uvm_mutex_unlock_nested(&new_block->lock);
10986 
10987     return status;
10988 }
10989 
10990 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block,
10991                                               uvm_va_block_region_t region)
10992 {
10993     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10994     uvm_va_range_t *va_range = va_block->va_range;
10995 
10996     if (!uvm_va_space_can_read_duplicate(va_space, NULL))
10997         return false;
10998 
10999     // TODO: Bug 3660922: need to implement HMM read duplication support.
11000     if (uvm_va_block_is_hmm(va_block) ||
11001         uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
11002         return false;
11003 
11004     if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET
11005         && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0)
11006         return false;
11007 
11008     return true;
11009 }
11010 
11011 // Returns the new access permission for the processor that faulted or
11012 // triggered access counter notifications on the given page
11013 //
11014 // TODO: Bug 1766424: this function works on a single page at a time. This
11015 //       could be changed in the future to optimize multiple faults/counters on
11016 //       contiguous pages.
11017 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block,
11018                                          uvm_va_block_context_t *va_block_context,
11019                                          uvm_page_index_t page_index,
11020                                          uvm_processor_id_t fault_processor_id,
11021                                          uvm_processor_id_t new_residency,
11022                                          uvm_fault_access_type_t access_type)
11023 {
11024     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11025     uvm_prot_t logical_prot, new_prot;
11026     uvm_processor_mask_t *revoke_processors = &va_block_context->scratch_processor_mask;
11027     struct vm_area_struct *hmm_vma = va_block_context->hmm.vma;
11028 
11029     // TODO: Bug 1766432: Refactor into policies. Current policy is
11030     //       query_promote: upgrade access privileges to avoid future faults IF
11031     //       they don't trigger further revocations.
11032     new_prot = uvm_fault_access_type_to_prot(access_type);
11033     logical_prot = compute_logical_prot(va_block, hmm_vma, page_index);
11034 
11035     UVM_ASSERT(logical_prot >= new_prot);
11036 
11037     if ((logical_prot > UVM_PROT_READ_ONLY) &&
11038         (new_prot == UVM_PROT_READ_ONLY) &&
11039         !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) {
11040 
11041         block_page_authorized_processors(va_block,
11042                                          page_index,
11043                                          UVM_PROT_READ_WRITE_ATOMIC,
11044                                          revoke_processors);
11045 
11046         uvm_processor_mask_andnot(revoke_processors,
11047                                   revoke_processors,
11048                                   &va_space->has_native_atomics[uvm_id_value(new_residency)]);
11049 
11050         // Only faultable processors matter here: the upgrade is allowed as
11051         // long as none of them would require a revocation.
11052         uvm_processor_mask_and(revoke_processors, revoke_processors, &va_space->faultable_processors);
11053 
11054         if (uvm_processor_mask_empty(revoke_processors))
11055             new_prot = UVM_PROT_READ_WRITE;
11056     }
11057 
11058     if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) {
11059         if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id))
11060             new_prot = UVM_PROT_READ_WRITE_ATOMIC;
11061     }
11062 
11063     return new_prot;
11064 }
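
// Worked example of the "query_promote" policy implemented above (comment-only
// sketch; the scenario is hypothetical):
//
//     GPU0 takes a READ fault, so uvm_fault_access_type_to_prot() yields
//     UVM_PROT_READ_ONLY. The VMA allows writes, the page is not a
//     read-duplication candidate, and every processor currently holding an RWA
//     mapping has native atomics to the new residency, so the upgrade cannot
//     force a revocation. The fault is then serviced with UVM_PROT_READ_WRITE
//     (and with UVM_PROT_READ_WRITE_ATOMIC if GPU0 itself has native atomics
//     to the new residency), avoiding a likely follow-up write fault.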
11065 
11066 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block,
11067                                                        uvm_va_block_context_t *va_block_context,
11068                                                        uvm_processor_id_t new_residency,
11069                                                        uvm_processor_id_t processor_id,
11070                                                        const uvm_processor_mask_t *map_processors,
11071                                                        uvm_va_block_region_t region,
11072                                                        const uvm_page_mask_t *map_page_mask,
11073                                                        uvm_prot_t max_prot,
11074                                                        const uvm_processor_mask_t *thrashing_processors,
11075                                                        uvm_tracker_t *tracker)
11076 {
11077     NV_STATUS status = NV_OK;
11078     uvm_processor_id_t map_processor_id;
11079     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11080     uvm_prot_t new_map_prot = max_prot;
11081     uvm_processor_mask_t *map_processors_local;
11082     uvm_processor_mask_t *native_atomics_mask = &va_space->has_native_atomics[uvm_id_value(new_residency)];
11083 
11084     map_processors_local = uvm_processor_mask_cache_alloc();
11085     if (!map_processors_local)
11086         return NV_ERR_NO_MEMORY;
11087 
11088     uvm_processor_mask_copy(map_processors_local, map_processors);
11089 
11090     // Handle atomic mappings separately
11091     if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) {
11092         if (uvm_processor_mask_test(native_atomics_mask, processor_id)) {
11093 
11094             for_each_id_in_mask(map_processor_id, map_processors_local) {
11095                 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
11096 
11097                 // Skip processors without native atomics to the residency.
11098                 if (!uvm_processor_mask_test(native_atomics_mask, map_processor_id))
11099                     continue;
11100 
11101                 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
11102                     cause = UvmEventMapRemoteCauseThrashing;
11103 
11104                 status = uvm_va_block_map(va_block,
11105                                           va_block_context,
11106                                           map_processor_id,
11107                                           region,
11108                                           map_page_mask,
11109                                           UVM_PROT_READ_WRITE_ATOMIC,
11110                                           cause,
11111                                           tracker);
11112                 if (status != NV_OK)
11113                     goto out;
11114             }
11115 
11116             // Filter out these mapped processors for the next steps
11117             uvm_processor_mask_andnot(map_processors_local, map_processors_local, native_atomics_mask);
11118 
11119             new_map_prot = UVM_PROT_READ_WRITE;
11120         }
11121         else {
11122             if (UVM_ID_IS_CPU(processor_id))
11123                 new_map_prot = UVM_PROT_READ_WRITE;
11124             else
11125                 new_map_prot = UVM_PROT_READ_ONLY;
11126         }
11127     }
11128 
11129     // Map the rest of processors
11130     for_each_id_in_mask(map_processor_id, map_processors_local) {
11131         UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
11132         uvm_prot_t final_map_prot;
11133         bool map_processor_has_enabled_system_wide_atomics =
11134             uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id);
11135 
11136         // Write mappings from processors with disabled system-wide atomics are treated like atomics
11137         if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics)
11138             final_map_prot = UVM_PROT_READ_WRITE_ATOMIC;
11139         else
11140             final_map_prot = new_map_prot;
11141 
11142         if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
11143             cause = UvmEventMapRemoteCauseThrashing;
11144 
11145         status = uvm_va_block_map(va_block,
11146                                   va_block_context,
11147                                   map_processor_id,
11148                                   region,
11149                                   map_page_mask,
11150                                   final_map_prot,
11151                                   cause,
11152                                   tracker);
11153         if (status != NV_OK)
11154             goto out;
11155     }
11156 
11157 out:
11158     uvm_processor_mask_cache_free(map_processors_local);
11159 
11160     return status;
11161 }
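
// Comment-only sketch of the two mapping passes above for a hypothetical
// topology: the pages are now resident on GPU0, processor_id == GPU0,
// max_prot == UVM_PROT_READ_WRITE_ATOMIC, and map_processors == {GPU1, GPU2},
// where GPU1 has native atomics to GPU0 (e.g. over NVLINK) and GPU2 does not:
//
//     Pass 1: GPU1 is in has_native_atomics[GPU0], so it is mapped with
//             UVM_PROT_READ_WRITE_ATOMIC and removed from the local mask;
//             new_map_prot drops to UVM_PROT_READ_WRITE for the remainder.
//     Pass 2: GPU2 is mapped with UVM_PROT_READ_WRITE, or promoted back to
//             UVM_PROT_READ_WRITE_ATOMIC if it has system-wide atomics
//             disabled (write and atomic permissions are then equivalent for
//             that processor).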
11162 
11163 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
11164                                                     uvm_va_block_context_t *va_block_context,
11165                                                     uvm_processor_id_t new_residency,
11166                                                     uvm_processor_id_t processor_id,
11167                                                     uvm_va_block_region_t region,
11168                                                     const uvm_page_mask_t *map_page_mask,
11169                                                     uvm_prot_t max_prot,
11170                                                     const uvm_processor_mask_t *thrashing_processors)
11171 {
11172     NV_STATUS tracker_status, status = NV_OK;
11173     uvm_processor_mask_t *map_other_processors = NULL;
11174     uvm_processor_mask_t *map_uvm_lite_gpus = NULL;
11175     uvm_processor_id_t map_processor_id;
11176     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11177     const uvm_page_mask_t *final_page_mask = map_page_mask;
11178     uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
11179     const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
11180     uvm_processor_id_t preferred_location;
11181 
11182     uvm_assert_mutex_locked(&va_block->lock);
11183 
11184     map_other_processors = uvm_processor_mask_cache_alloc();
11185     if (!map_other_processors) {
11186         status = NV_ERR_NO_MEMORY;
11187         goto out;
11188     }
11189 
11190     map_uvm_lite_gpus = uvm_processor_mask_cache_alloc();
11191     if (!map_uvm_lite_gpus) {
11192         status = NV_ERR_NO_MEMORY;
11193         goto out;
11194     }
11195 
11196     // Read duplication takes precedence over SetAccessedBy.
11197     //
11198     // Exclude ranges with read duplication set...
11199     if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
11200         status = NV_OK;
11201         goto out;
11202     }
11203 
11204     // ... and pages read-duplicated by performance heuristics
11205     if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) {
11206         if (map_page_mask) {
11207             uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask,
11208                                  map_page_mask,
11209                                  &va_block->read_duplicated_pages);
11210         }
11211         else {
11212             uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages);
11213         }
11214         final_page_mask = &va_block_context->mapping.filtered_page_mask;
11215     }
11216 
11217     // Add mappings for accessed_by processors and the given processor mask
11218     if (thrashing_processors)
11219         uvm_processor_mask_or(map_other_processors, &policy->accessed_by, thrashing_processors);
11220     else
11221         uvm_processor_mask_copy(map_other_processors, &policy->accessed_by);
11222 
11223     // Only processors that can access the new location need to be considered
11224     uvm_processor_mask_and(map_other_processors,
11225                            map_other_processors,
11226                            &va_space->accessible_from[uvm_id_value(new_residency)]);
11227 
11228     // Exclude caller processor as it must have already been mapped
11229     uvm_processor_mask_clear(map_other_processors, processor_id);
11230 
11231     // Exclude preferred location so it won't get remote mappings
11232     preferred_location = policy->preferred_location;
11233     if (UVM_ID_IS_VALID(preferred_location) &&
11234         !uvm_id_equal(new_residency, preferred_location) &&
11235         uvm_va_space_processor_has_memory(va_space, preferred_location)) {
11236         uvm_processor_mask_clear(map_other_processors, preferred_location);
11237     }
11238 
11239     // Map the UVM-Lite GPUs if the new location is the preferred location. This
11240     // will only create mappings on first touch. After that they're persistent
11241     // so uvm_va_block_map will be a no-op.
11242     uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
11243     if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
11244         uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
11245         for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
11246             status = uvm_va_block_map(va_block,
11247                                       va_block_context,
11248                                       map_processor_id,
11249                                       region,
11250                                       final_page_mask,
11251                                       UVM_PROT_READ_WRITE_ATOMIC,
11252                                       UvmEventMapRemoteCauseCoherence,
11253                                       &local_tracker);
11254             if (status != NV_OK)
11255                 goto out;
11256         }
11257     }
11258 
11259     uvm_processor_mask_andnot(map_other_processors, map_other_processors, block_get_uvm_lite_gpus(va_block));
11260 
11261     // We can't map non-migratable pages to the CPU. If we have any, build a
11262     // new mask of migratable pages and map the CPU separately.
11263     if (uvm_processor_mask_test(map_other_processors, UVM_ID_CPU) &&
11264         !uvm_range_group_all_migratable(va_space,
11265                                         uvm_va_block_region_start(va_block, region),
11266                                         uvm_va_block_region_end(va_block, region))) {
11267         uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask;
11268 
11269         uvm_range_group_migratable_page_mask(va_block, region, migratable_mask);
11270         if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) {
11271             status = do_block_add_mappings_after_migration(va_block,
11272                                                            va_block_context,
11273                                                            new_residency,
11274                                                            processor_id,
11275                                                            &g_uvm_processor_mask_cpu,
11276                                                            region,
11277                                                            migratable_mask,
11278                                                            max_prot,
11279                                                            thrashing_processors,
11280                                                            &local_tracker);
11281             if (status != NV_OK)
11282                 goto out;
11283         }
11284 
11285         uvm_processor_mask_clear(map_other_processors, UVM_ID_CPU);
11286     }
11287 
11288     status = do_block_add_mappings_after_migration(va_block,
11289                                                    va_block_context,
11290                                                    new_residency,
11291                                                    processor_id,
11292                                                    map_other_processors,
11293                                                    region,
11294                                                    final_page_mask,
11295                                                    max_prot,
11296                                                    thrashing_processors,
11297                                                    &local_tracker);
11298     if (status != NV_OK)
11299         goto out;
11300 
11301 out:
11302     tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
11303     uvm_tracker_deinit(&local_tracker);
11304     uvm_processor_mask_cache_free(map_other_processors);
11305     uvm_processor_mask_cache_free(map_uvm_lite_gpus);
11306 
11307     return status == NV_OK ? tracker_status : status;
11308 }
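
// Comment-only sketch of how the set of processors mapped above is assembled;
// the ordering mirrors the code:
//
//     map_other_processors  = accessed_by (| thrashing_processors)
//     map_other_processors &= accessible_from[new_residency]
//     map_other_processors -= { processor_id }        // already mapped by the caller
//     map_other_processors -= { preferred_location }  // keep it free of remote mappings
//
// UVM-Lite GPUs are mapped RWA only when the new residency is the preferred
// location and are then removed from the set; the CPU is mapped separately
// with a reduced page mask when some pages are non-migratable. Pages
// read-duplicated by the heuristics are filtered out of the page mask first,
// since read duplication takes precedence over SetAccessedBy.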
11309 
11310 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
11311                                                         uvm_va_block_context_t *va_block_context,
11312                                                         uvm_processor_id_t processor_id,
11313                                                         uvm_page_index_t page_index)
11314 {
11315     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11316     uvm_processor_mask_t *resident_processors = &va_block_context->scratch_processor_mask;
11317     NvU32 resident_processors_count;
11318 
11319     if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id))
11320         return UVM_PROT_READ_WRITE_ATOMIC;
11321 
11322     uvm_va_block_page_resident_processors(va_block, page_index, resident_processors);
11323     resident_processors_count = uvm_processor_mask_get_count(resident_processors);
11324 
11325     if (resident_processors_count == 0) {
11326         return UVM_PROT_NONE;
11327     }
11328     else if (resident_processors_count > 1) {
11329         // If there are many copies, we can only map READ ONLY
11330         //
11331         // The block state doesn't track the mapping target (aperture) of each
11332         // individual PTE, just the permissions and where the data is resident.
11333         // If the data is resident in multiple places, then we have a problem
11334         // since we can't know where the PTE points. This means we won't know
11335         // what needs to be unmapped for cases like UvmUnregisterGpu and
11336         // UvmDisablePeerAccess.
11337         //
11338         // The simple way to solve this is to enforce that a read-duplication
11339         // mapping always points to local memory.
11340         if (uvm_processor_mask_test(resident_processors, processor_id))
11341             return UVM_PROT_READ_ONLY;
11342 
11343         return UVM_PROT_NONE;
11344     }
11345     else {
11346         uvm_processor_id_t atomic_id;
11347         uvm_processor_id_t residency;
11348         uvm_processor_mask_t *atomic_mappings;
11349         uvm_processor_mask_t *write_mappings;
11350 
11351         // Find the id of the processor holding the only resident copy
11352         residency = uvm_processor_mask_find_first_id(resident_processors);
11353         UVM_ASSERT(UVM_ID_IS_VALID(residency));
11354 
11355         // If we cannot map the processor with the resident copy, exit
11356         if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id))
11357             return UVM_PROT_NONE;
11358 
11359         // Fast path: if the page is not mapped anywhere else, it can be safely
11360         // mapped with RWA permission
11361         if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index) &&
11362             !uvm_va_block_is_hmm(va_block))
11363             return UVM_PROT_READ_WRITE_ATOMIC;
11364 
11365         atomic_mappings = &va_block_context->scratch_processor_mask;
11366 
11367         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, atomic_mappings);
11368 
11369         // Exclude processors with system-wide atomics disabled from atomic_mappings
11370         uvm_processor_mask_and(atomic_mappings, atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
11371 
11372         // Exclude the processor for which the mapping protections are being computed
11373         uvm_processor_mask_clear(atomic_mappings, processor_id);
11374 
11375         // If there is any processor with atomic mapping, check if it has native atomics to the processor
11376         // with the resident copy. If it does not, we can only map READ ONLY
11377         atomic_id = uvm_processor_mask_find_first_id(atomic_mappings);
11378         if (UVM_ID_IS_VALID(atomic_id) &&
11379             !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) {
11380             return UVM_PROT_READ_ONLY;
11381         }
11382 
11383         write_mappings = &va_block_context->scratch_processor_mask;
11384 
11385         block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, write_mappings);
11386 
11387         // Exclude the processor for which the mapping protections are being computed
11388         uvm_processor_mask_clear(write_mappings, processor_id);
11389 
11390         // At this point, any processor with atomic mappings either has native
11391         // atomics support to the processor with the resident copy or has
11392         // disabled system-wide atomics. If the requesting processor has
11393         // disabled system-wide atomics or has native atomics to that processor,
11394         // we can map with ATOMIC privileges. Likewise, if there are no other
11395         // processors with WRITE or ATOMIC mappings, we can map with ATOMIC
11396         // privileges. For HMM, don't allow GPU atomic access to remote mapped
11397         // system memory even if there are no write mappings since CPU access
11398         // can be upgraded without notification.
11399         if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) ||
11400             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) ||
11401             (uvm_processor_mask_empty(write_mappings) && !uvm_va_block_is_hmm(va_block))) {
11402             return UVM_PROT_READ_WRITE_ATOMIC;
11403         }
11404 
11405         return UVM_PROT_READ_WRITE;
11406     }
11407 }
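
// Comment-only examples of the permission computed above (hypothetical
// processors):
//
//     Page resident on GPU0 and GPU1 (read-duplicated), caller is GPU0:
//         UVM_PROT_READ_ONLY (local copy); a non-resident caller gets
//         UVM_PROT_NONE.
//     Page resident only on GPU0, not in maybe_mapped_pages, non-HMM block,
//     caller is GPU1 with access to GPU0:
//         UVM_PROT_READ_WRITE_ATOMIC (fast path).
//     Page resident only on GPU0, and another processor with system-wide
//     atomics enabled holds an RWA mapping but lacks native atomics to GPU0:
//         UVM_PROT_READ_ONLY for the caller.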
11408 
11409 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
11410                                     uvm_va_block_context_t *va_block_context,
11411                                     uvm_processor_id_t processor_id,
11412                                     uvm_va_block_region_t region,
11413                                     const uvm_page_mask_t *page_mask,
11414                                     UvmEventMapRemoteCause cause)
11415 {
11416     uvm_va_range_t *va_range = va_block->va_range;
11417     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11418     NV_STATUS status = NV_OK;
11419     uvm_page_index_t page_index;
11420     uvm_range_group_range_iter_t iter;
11421     uvm_prot_t prot_to_map;
11422 
11423     if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
11424         if (!uvm_va_range_vma_check(va_range, va_block_context->mm))
11425             return NV_OK;
11426 
11427         uvm_range_group_range_migratability_iter_first(va_space,
11428                                                        uvm_va_block_region_start(va_block, region),
11429                                                        uvm_va_block_region_end(va_block, region),
11430                                                        &iter);
11431     }
11432 
11433     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map)
11434         va_block_context->mask_by_prot[prot_to_map - 1].count = 0;
11435 
11436     for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
11437         // Read duplication takes precedence over SetAccessedBy. Exclude pages
11438         // read-duplicated by performance heuristics
11439         if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))
11440             continue;
11441 
11442         prot_to_map = uvm_va_block_page_compute_highest_permission(va_block,
11443                                                                    va_block_context,
11444                                                                    processor_id,
11445                                                                    page_index);
11446         if (prot_to_map == UVM_PROT_NONE)
11447             continue;
11448 
11449         if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
11450             while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) {
11451                 uvm_range_group_range_migratability_iter_next(va_space,
11452                                                               &iter,
11453                                                               uvm_va_block_region_end(va_block, region));
11454             }
11455 
11456             if (!iter.migratable)
11457                 continue;
11458         }
11459 
11460         if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0)
11461             uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask);
11462 
11463         uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index);
11464     }
11465 
11466     for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) {
11467         if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0)
11468             continue;
11469 
11470         status = uvm_va_block_map(va_block,
11471                                   va_block_context,
11472                                   processor_id,
11473                                   region,
11474                                   &va_block_context->mask_by_prot[prot_to_map - 1].page_mask,
11475                                   prot_to_map,
11476                                   cause,
11477                                   &va_block->tracker);
11478         if (status != NV_OK)
11479             break;
11480     }
11481 
11482     return status;
11483 }
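
// Comment-only sketch of the batching above: pages are bucketed by the highest
// protection they can be mapped with, and uvm_va_block_map() is then issued
// once per non-empty bucket instead of once per page:
//
//     mask_by_prot[UVM_PROT_READ_ONLY - 1]         = pages mappable read-only
//     mask_by_prot[UVM_PROT_READ_WRITE - 1]        = pages mappable read/write
//     mask_by_prot[UVM_PROT_READ_WRITE_ATOMIC - 1] = pages mappable with atomics
//
// Read-duplicated pages and (for the CPU) non-migratable pages are skipped
// before the bucketing.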
11484 
11485 static bool can_read_duplicate(uvm_va_block_t *va_block,
11486                                uvm_page_index_t page_index,
11487                                const uvm_va_policy_t *policy,
11488                                const uvm_perf_thrashing_hint_t *thrashing_hint)
11489 {
11490     if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block)))
11491         return true;
11492 
11493     if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
11494         uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) &&
11495         thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN)
11496         return true;
11497 
11498     return false;
11499 }
11500 
11501 // TODO: Bug 1827400: If the faulting processor has support for native
11502 //       atomics to the current location and the faults on the page were
11503 //       triggered by atomic accesses only, we keep the current residency.
11504 //       This is a short-term solution to exercise remote atomics over
11505 //       NVLINK when possible (not only when preferred location is set to
11506 //       the remote GPU) as they are much faster than relying on page
11507 //       faults and permission downgrades, which cause thrashing. In the
11508 //       future, the thrashing detection/prevention heuristics should
11509 //       detect and handle this case.
11510 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space,
11511                                        NvU32 access_type_mask,
11512                                        uvm_processor_id_t processor_id,
11513                                        uvm_processor_id_t residency)
11514 {
11515     // This policy can be enabled/disabled using a module parameter
11516     if (!uvm_perf_map_remote_on_native_atomics_fault)
11517         return false;
11518 
11519     // Only consider atomics faults
11520     if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)
11521         return false;
11522 
11523     // We cannot differentiate CPU writes from atomics. We exclude CPU faults
11524     // from the logic explained above in order to avoid mapping CPU to vidmem
11525     // memory due to a write.
11526     if (UVM_ID_IS_CPU(processor_id))
11527         return false;
11528 
11529     // On P9 systems (which have native HW support for system-wide atomics), we
11530     // have determined experimentally that placing memory on a GPU yields the
11531     // best performance in most cases (since the CPU can cache vidmem but not
11532     // vice versa). Therefore, don't map remotely if the current residency is
11533     // sysmem.
11534     if (UVM_ID_IS_CPU(residency))
11535         return false;
11536 
11537     return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id);
11538 }
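
// Comment-only sketch of how the caller below uses this policy (the GPU and
// residency identifiers are hypothetical placeholders):
//
//     if (map_remote_on_atomic_fault(va_space, access_type_mask,
//                                    gpu_id, peer_residency_id))
//         return peer_residency_id;   // keep the data where it is and map it
//                                     // remotely over the native-atomics link
//
// This trades the latency of remote atomics (e.g. over NVLINK) against the
// much higher cost of repeated faults and permission downgrades.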
11539 
11540 // TODO: Bug 1766424: this function works on a single page at a time. This
11541 //       could be changed in the future to optimize multiple faults or access
11542 //       counter notifications on contiguous pages.
11543 static uvm_processor_id_t block_select_processor_residency(uvm_va_block_t *va_block,
11544                                                            uvm_va_block_context_t *va_block_context,
11545                                                            uvm_page_index_t page_index,
11546                                                            uvm_processor_id_t processor_id,
11547                                                            NvU32 access_type_mask,
11548                                                            const uvm_va_policy_t *policy,
11549                                                            const uvm_perf_thrashing_hint_t *thrashing_hint,
11550                                                            uvm_service_operation_t operation,
11551                                                            const bool hmm_migratable,
11552                                                            bool *read_duplicate)
11553 {
11554     uvm_processor_id_t closest_resident_processor;
11555     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11556     bool may_read_duplicate;
11557     uvm_processor_id_t preferred_location;
11558 
11559     // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as
11560     // HMM migration is implemented for VMAs other than anonymous memory.
11561     // TODO: Bug 4050579: Remove hmm_migratable check when swap cached pages
11562     // can be migrated.
11563     if (is_uvm_fault_force_sysmem_set() ||
11564         !hmm_migratable ||
11565         uvm_hmm_must_use_sysmem(va_block, va_block_context->hmm.vma)) {
11566         *read_duplicate = false;
11567         return UVM_ID_CPU;
11568     }
11569 
11570     may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
11571 
11572     // Read/prefetch faults on a VA range with read duplication enabled
11573     // always create a copy of the page on the faulting processor's memory.
11574     // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH,
11575     // which will lead to read duplication if it is enabled.
11576     *read_duplicate = may_read_duplicate &&
11577                       (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ);
11578 
11579     if (*read_duplicate)
11580         return processor_id;
11581 
11582     *read_duplicate = false;
11583 
11584     // If read-duplication is active in the page but we are not
11585     // read-duplicating because the access type is not a read or a prefetch,
11586     // the faulting processor should get a local copy
11587     if (may_read_duplicate)
11588         return processor_id;
11589 
11590     // If the faulting processor is the preferred location always migrate
11591     preferred_location = policy->preferred_location;
11592     if (uvm_id_equal(processor_id, preferred_location)) {
11593         if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) {
11594             UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN);
11595             if (uvm_va_space_processor_has_memory(va_space, processor_id))
11596                 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id));
11597         }
11598 
11599         return processor_id;
11600     }
11601 
11602     // If the faulting processor is the CPU, HMM has to migrate the block to
11603     // system memory.
11604     // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements.
11605     if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block))
11606         return processor_id;
11607 
11608     if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
11609         UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)],
11610                                            processor_id));
11611         return thrashing_hint->pin.residency;
11612     }
11613 
11614     closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block,
11615                                                                         va_block_context,
11616                                                                         page_index,
11617                                                                         processor_id);
11618 
11619     // If the page is not resident anywhere, select the preferred location as
11620     // long as the preferred location is accessible from the faulting processor.
11621     // Otherwise select the faulting processor.
11622     if (UVM_ID_IS_INVALID(closest_resident_processor)) {
11623         if (UVM_ID_IS_VALID(preferred_location) &&
11624             uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)],
11625                                     processor_id)) {
11626             return preferred_location;
11627         }
11628 
11629         return processor_id;
11630     }
11631 
11632     // AccessedBy mappings might not have been created for the CPU if the thread
11633     // which made the memory resident did not have the proper references on the
11634     // mm_struct (for example, the GPU fault handling path when
11635     // uvm_va_space_mm_enabled() is false).
11636     //
11637     // Also, in uvm_migrate_*, we implement a two-pass scheme in which
11638     // AccessedBy mappings may be delayed to the second pass. This can produce
11639     // faults even if the faulting processor is in the accessed_by mask.
11640     //
11641     // Here, we keep it on the current residency and we just add the missing
11642     // mapping.
11643     if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
11644         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
11645         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
11646         return closest_resident_processor;
11647     }
11648 
11649     // Check if we should map the closest resident processor remotely on atomic
11650     // fault
11651     if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
11652         return closest_resident_processor;
11653 
11654     // If the processor has access to the preferred location, and the page is
11655     // not resident on the accessing processor, move it to the preferred
11656     // location.
11657     if (!uvm_id_equal(closest_resident_processor, processor_id) &&
11658         UVM_ID_IS_VALID(preferred_location) &&
11659         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
11660         return preferred_location;
11661 
11662     // Check if we should map the closest resident processor remotely on a remote CPU fault.
11663     //
11664     // A CPU fault is handled on behalf of a Linux process whose address space is pointed
11665     // to by current->mm. A block of memory resident on a GPU is also associated with an
11666     // address space, pointed to by va_block_context->mm. If the two match, it's a regular
11667     // (local) fault, and we may want to migrate the page from the GPU to the CPU.
11668     // If it's a 'remote' fault, i.e. the Linux process differs from the one associated
11669     // with the block's address space, we may want to preserve the current residency.
11670     //
11671     // Servicing a remote fault without access counters means the memory could stay in
11672     // the wrong place for a long time, which is why we prefer to avoid creating remote
11673     // mappings. However, when a NIC accesses memory resident on a GPU, it's worth keeping
11674     // it in place for the NIC accesses.
11675     //
11676     // The logic used to detect remote faulting also keeps memory in place for
11677     // ptrace accesses. We would prefer to control those policies separately, but the
11678     // NIC case takes priority.
11679     // If the accessing processor is the CPU, we're either handling a fault
11680     // from a process other than the owning one, or we're handling an MOMC
11681     // notification. Only prevent migration for the former.
11682     if (UVM_ID_IS_CPU(processor_id) &&
11683         operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
11684         uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
11685         va_block_context->mm != current->mm) {
11686         UVM_ASSERT(va_block_context->mm != NULL);
11687         return closest_resident_processor;
11688     }
11689 
11690     // If the page is resident on a processor other than the preferred location,
11691     // or the faulting processor can't access the preferred location, we select
11692     // the faulting processor as the new residency.
11693     return processor_id;
11694 }
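
// Comment-only summary of the residency decision above, in priority order:
//
//     1. Forced sysmem (module param or sysmem-only/non-migratable HMM) -> CPU.
//     2. Read duplication applies to the page                           -> faulting processor.
//     3. Faulting processor is the preferred location                   -> faulting processor.
//     4. CPU fault on an HMM block                                      -> CPU.
//     5. Thrashing pin hint                                             -> pinned residency.
//     6. Page resident nowhere                                          -> preferred location if
//                                                                          accessible, else faulting processor.
//     7. Faulting processor is in accessed_by (non-access-counter op)   -> current residency.
//     8. Remote mapping on atomic fault policy applies                  -> current residency.
//     9. Preferred location accessible, page not locally resident       -> preferred location.
//    10. Remote CPU fault (current->mm != va_block_context->mm)         -> current residency.
//    11. Otherwise                                                      -> faulting processor.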
11695 
11696 static int block_select_node_residency(uvm_va_block_t *va_block,
11697                                        uvm_page_index_t page_index,
11698                                        uvm_processor_id_t new_residency,
11699                                        const uvm_va_policy_t *policy,
11700                                        const uvm_perf_thrashing_hint_t *thrashing_hint)
11701 {
11702     // For CPU faults, the fault handler runs on the CPU that faulted.
11703     // For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
11704     // Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
11705     // ID of the faulting processor.
11706     // Note that numa_mem_id() returns the nearest node with memory. In most
11707     // cases, this will be the current NUMA node. However, in the case that the
11708     // current node does not have any memory, we probably want the nearest node
11709     // with memory, anyway.
11710     int current_nid = numa_mem_id();
11711     bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
11712 
11713     // For HMM allocations UVM doesn't always control allocation of the
11714     // destination page as the kernel may have already allocated one. Therefore
11715     // we can't respect the preferred node ID for HMM pages.
11716     // TODO: Bug 4453874: [UVM-HMM] Respect the preferred CPU NUMA Node ID when making a HMM page resident
11717     if (uvm_va_block_is_hmm(va_block))
11718         return NUMA_NO_NODE;
11719 
11720     // If the new resident processor is not the CPU, return the preferred nid
11721     // since it could be used for CPU allocations of staging pages.
11722     if (!UVM_ID_IS_CPU(new_residency))
11723         return policy->preferred_nid;
11724 
11725     // If the preferred location is the CPU, the new resident nid is the
11726     // preferred nid.
11727     if (UVM_ID_IS_CPU(policy->preferred_location))
11728         return policy->preferred_nid;
11729 
11730     // If read duplication is enabled and the page is also resident on the CPU,
11731     // keep its current NUMA node residency.
11732     if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
11733         return NUMA_NO_NODE;
11734 
11735     // The new_residency processor is the CPU and the preferred location is not
11736     // the CPU. If the page is resident on the CPU, keep its current residency.
11737     if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
11738         return NUMA_NO_NODE;
11739 
11740     return current_nid;
11741 }
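
// Comment-only example of the node selection above: a CPU fault on a non-HMM
// block whose preferred location is a GPU, where the page is not currently
// resident on any CPU NUMA node and is not a read-duplication candidate. None
// of the early returns apply, so the page is serviced on numa_mem_id(), i.e.
// the memory node closest to the faulting CPU.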
11742 
11743 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
11744                                                  uvm_va_block_context_t *va_block_context,
11745                                                  uvm_page_index_t page_index,
11746                                                  uvm_processor_id_t processor_id,
11747                                                  NvU32 access_type_mask,
11748                                                  const uvm_va_policy_t *policy,
11749                                                  const uvm_perf_thrashing_hint_t *thrashing_hint,
11750                                                  uvm_service_operation_t operation,
11751                                                  const bool hmm_migratable,
11752                                                  bool *read_duplicate)
11753 {
11754     uvm_processor_id_t id;
11755     int nid;
11756 
11757     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
11758                                                   va_block_context->hmm.vma,
11759                                                   uvm_va_block_region_for_page(page_index)));
11760 
11761     // First, select the processor for the new residency.
11762     id = block_select_processor_residency(va_block,
11763                                           va_block_context,
11764                                           page_index,
11765                                           processor_id,
11766                                           access_type_mask,
11767                                           policy,
11768                                           thrashing_hint,
11769                                           operation,
11770                                           hmm_migratable,
11771                                           read_duplicate);
11772 
11773     // If the intended residency doesn't have memory, fall back to the CPU.
11774     if (!block_processor_has_memory(va_block, id)) {
11775         *read_duplicate = false;
11776         id = UVM_ID_CPU;
11777     }
11778 
11779     // Now that we know the new residency processor, select the NUMA node ID
11780     // based on the new processor.
11781     nid = block_select_node_residency(va_block, page_index, id, policy, thrashing_hint);
11782 
11783     va_block_context->make_resident.dest_nid = nid;
11784 
11785     return id;
11786 }
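
// Comment-only usage sketch for the helper above, as a per-page servicing loop
// might call it (the surrounding locals such as gpu, policy, thrashing_hint and
// hmm_migratable are hypothetical here):
//
//     bool read_duplicate;
//     uvm_processor_id_t new_residency;
//
//     new_residency = uvm_va_block_select_residency(va_block,
//                                                   service_context->block_context,
//                                                   page_index,
//                                                   gpu->id,
//                                                   access_type_mask,
//                                                   policy,
//                                                   &thrashing_hint,
//                                                   UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
//                                                   hmm_migratable,
//                                                   &read_duplicate);
//
// The chosen CPU NUMA node is returned implicitly through
// service_context->block_context->make_resident.dest_nid.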
11787 
11788 static bool check_access_counters_dont_revoke(uvm_va_block_t *block,
11789                                               uvm_va_block_context_t *block_context,
11790                                               uvm_va_block_region_t region,
11791                                               const uvm_processor_mask_t *revoke_processors,
11792                                               const uvm_page_mask_t *revoke_page_mask,
11793                                               uvm_prot_t revoke_prot)
11794 {
11795     uvm_processor_id_t id;
11796     for_each_id_in_mask(id, revoke_processors) {
11797         const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot);
11798 
11799         uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot);
11800 
11801         UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0);
11802     }
11803 
11804     return true;
11805 }
11806 
11807 // Update service_context->prefetch_hint, service_context->per_processor_masks,
11808 // and service_context->region.
11809 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
11810                                            const uvm_va_policy_t *policy,
11811                                            uvm_service_block_context_t *service_context)
11812 {
11813     uvm_processor_id_t new_residency;
11814     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11815 
11816     // Performance heuristics policy: we only consider prefetching when all
11817     // migrations target a single processor.
11818     if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) {
11819         uvm_page_index_t page_index;
11820         uvm_page_mask_t *new_residency_mask;
11821 
11822         new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors);
11823         new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11824 
11825         // Update prefetch tracking structure with the pages that will migrate
11826         // due to faults
11827         uvm_perf_prefetch_get_hint_va_block(va_block,
11828                                             service_context->block_context,
11829                                             new_residency,
11830                                             new_residency_mask,
11831                                             service_context->region,
11832                                             &service_context->prefetch_bitmap_tree,
11833                                             &service_context->prefetch_hint);
11834 
11835         // Obtain the prefetch hint and give a fake fault access type to the
11836         // prefetched pages
11837         if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) {
11838             const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask;
11839 
11840             for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) {
11841                 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index));
11842 
11843                 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH;
11844 
11845                 if (uvm_va_policy_is_read_duplicate(policy, va_space) ||
11846                     (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
11847                      uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) {
11848                     if (service_context->read_duplicate_count++ == 0)
11849                         uvm_page_mask_zero(&service_context->read_duplicate_mask);
11850 
11851                     uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
11852                 }
11853             }
11854 
11855             uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask);
11856             service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask);
11857         }
11858     }
11859     else {
11860         service_context->prefetch_hint.residency = UVM_ID_INVALID;
11861     }
11862 }
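
// Comment-only example of the hint expansion above: if the current batch makes
// pages [8..15] of the block resident on GPU0 and the prefetcher suggests also
// pulling in pages [0..7], then after this call:
//
//     access_type[0..7]  == UVM_FAULT_ACCESS_TYPE_PREFETCH
//     new_residency_mask covers pages [0..15]
//     service_context->region is re-derived from the widened mask, and any
//     prefetched pages that qualify are added to read_duplicate_mask.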
11863 
11864 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
11865                                     uvm_processor_id_t new_residency,
11866                                     uvm_va_block_t *va_block,
11867                                     uvm_va_block_retry_t *block_retry,
11868                                     uvm_service_block_context_t *service_context)
11869 {
11870     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11871     uvm_processor_mask_t *all_involved_processors =
11872         &service_context->block_context->make_resident.all_involved_processors;
11873     uvm_page_mask_t *new_residency_mask =
11874         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11875     uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
11876     uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
11877     uvm_make_resident_cause_t cause;
11878     NV_STATUS status;
11879 
11880     // 1- Migrate pages
11881     switch (service_context->operation) {
11882         case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS:
11883             cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
11884             break;
11885         case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS:
11886             cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
11887             break;
11888         case UVM_SERVICE_OPERATION_ACCESS_COUNTERS:
11889             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
11890             break;
11891         default:
11892             UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation);
11893             // Set cause to silence compiler warning that it may be unused.
11894             cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
11895             break;
11896     }
11897 
11898     // Reset masks before all of the make_resident calls
11899     uvm_page_mask_zero(did_migrate_mask);
11900     uvm_processor_mask_zero(all_involved_processors);
11901 
11902     // Handle read duplication first so that the caller_page_mask will be free
11903     // to use below and still valid in uvm_va_block_service_finish().
11904     // TODO: Bug 3660922: need to implement HMM read duplication support.
11905     if (service_context->read_duplicate_count != 0 &&
11906         uvm_page_mask_and(caller_page_mask,
11907                           new_residency_mask,
11908                           &service_context->read_duplicate_mask)) {
11909         status = uvm_va_block_make_resident_read_duplicate(va_block,
11910                                                            block_retry,
11911                                                            service_context->block_context,
11912                                                            new_residency,
11913                                                            service_context->region,
11914                                                            caller_page_mask,
11915                                                            &service_context->prefetch_hint.prefetch_pages_mask,
11916                                                            cause);
11917         if (status != NV_OK)
11918             return status;
11919     }
11920 
11921     if (service_context->read_duplicate_count == 0 ||
11922         uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) {
11923         if (service_context->read_duplicate_count == 0)
11924             uvm_page_mask_copy(caller_page_mask, new_residency_mask);
11925         status = uvm_va_block_make_resident_copy(va_block,
11926                                                  block_retry,
11927                                                  service_context->block_context,
11928                                                  new_residency,
11929                                                  service_context->region,
11930                                                  caller_page_mask,
11931                                                  &service_context->prefetch_hint.prefetch_pages_mask,
11932                                                  cause);
11933         if (status != NV_OK)
11934             return status;
11935     }
11936 
11937     if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors))
11938         service_context->cpu_fault.did_migrate = true;
11939 
11940     // 2- Check for ECC errors on all GPUs involved in the migration if CPU is
11941     //    the destination. Migrations in response to CPU faults are special
11942     //    because they're on the only path (apart from tools) where CUDA is not
11943     //    involved and wouldn't have a chance to do its own ECC checking.
11944     if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS &&
11945         UVM_ID_IS_CPU(new_residency) &&
11946         !uvm_processor_mask_empty(all_involved_processors)) {
11947         uvm_gpu_t *gpu;
11948 
11949         // Before checking for ECC errors, make sure all of the GPU work
11950         // is finished. Creating mappings on the CPU would have to wait
11951         // for the tracker anyway so this shouldn't hurt performance.
11952         status = uvm_tracker_wait(&va_block->tracker);
11953         if (status != NV_OK)
11954             return status;
11955 
11956         for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) {
11957             // We cannot call into RM here so use the no RM ECC check.
11958             status = uvm_gpu_check_ecc_error_no_rm(gpu);
11959             if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
11960                 // In case we need to call into RM to be sure whether
11961                 // there is an ECC error or not, signal that to the
11962                 // caller by adding the GPU to the mask.
11963                 //
11964                 // In that case the ECC error might be noticed only after
11965                 // the CPU mappings have been already created below,
11966                 // exposing different CPU threads to the possibly corrupt
11967                 // data, but this thread will fault eventually and that's
11968                 // considered to be an acceptable trade-off between
11969                 // performance and ECC error containment.
11970                 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id);
11971                 status = NV_OK;
11972             }
11973             if (status != NV_OK)
11974                 return status;
11975         }
11976     }
11977 
11978     return NV_OK;
11979 }
11980 
11981 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
11982                                       uvm_va_block_t *va_block,
11983                                       uvm_service_block_context_t *service_context)
11984 {
11985     uvm_processor_id_t new_residency = service_context->block_context->make_resident.dest_id;
11986     uvm_page_mask_t *new_residency_mask =
11987         &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11988     uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
11989     uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
11990     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11991     uvm_prot_t new_prot;
11992     uvm_page_index_t page_index;
11993     uvm_processor_mask_t *revoke_processors;
11994     NV_STATUS status = NV_OK;
11995 
11996     // Update residency.
11997     if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
11998         uvm_va_block_make_resident_finish(va_block,
11999                                           service_context->block_context,
12000                                           service_context->region,
12001                                           caller_page_mask);
12002 
12003     uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask);
12004 
12005     // The loops below depend on the enums having the following values in order
12006     // to index into service_context->mappings_by_prot[].
12007     BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1);
12008     BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2);
12009     BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3);
12010     BUILD_BUG_ON(UVM_PROT_MAX != 4);
12011 
12012     revoke_processors = uvm_processor_mask_cache_alloc();
12013     if (!revoke_processors)
12014         return NV_ERR_NO_MEMORY;
12015 
12016     // 1- Compute mapping protections for the requesting processor on the new
12017     // residency.
12018     for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot)
12019         service_context->mappings_by_prot[new_prot - 1].count = 0;
12020 
12021     for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
12022         new_prot = compute_new_permission(va_block,
12023                                           service_context->block_context,
12024                                           page_index,
12025                                           processor_id,
12026                                           new_residency,
12027                                           service_context->access_type[page_index]);
12028 
12029         if (service_context->mappings_by_prot[new_prot - 1].count++ == 0)
12030             uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask);
12031 
12032         uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index);
12033     }
12034 
12035     // 2- Revoke permissions
12036     //
12037     // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations.
12038     //       Thus, we need to revoke only if residency did not change and we
12039     //       are mapping higher than READ ONLY.
12040     for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12041         bool pages_need_revocation;
12042         uvm_prot_t revoke_prot;
12043         bool this_processor_has_enabled_atomics;
12044 
12045         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12046             continue;
12047 
12048         pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask,
12049                                                   &service_context->did_not_migrate_mask,
12050                                                   &service_context->mappings_by_prot[new_prot - 1].page_mask);
12051         if (!pages_need_revocation)
12052             continue;
12053 
12054         uvm_processor_mask_and(revoke_processors, &va_block->mapped, &va_space->faultable_processors);
12055 
12056         // Do not revoke the processor that took the fault
12057         uvm_processor_mask_clear(revoke_processors, processor_id);
12058 
12059         this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors,
12060                                                                      processor_id);
12061 
12062         // Atomic operations on processors with system-wide atomics
12063         // disabled or with native atomics access to new_residency
12064         // behave like writes.
12065         if (new_prot == UVM_PROT_READ_WRITE ||
12066             !this_processor_has_enabled_atomics ||
12067             uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) {
12068 
12069             // Exclude processors with native atomics on the resident copy
12070             uvm_processor_mask_andnot(revoke_processors,
12071                                       revoke_processors,
12072                                       &va_space->has_native_atomics[uvm_id_value(new_residency)]);
12073 
12074             // Exclude processors with disabled system-wide atomics
12075             uvm_processor_mask_and(revoke_processors,
12076                                    revoke_processors,
12077                                    &va_space->system_wide_atomics_enabled_processors);
12078         }
12079 
12080         if (UVM_ID_IS_CPU(processor_id)) {
12081             revoke_prot = UVM_PROT_READ_WRITE_ATOMIC;
12082         }
12083         else {
12084             revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE:
12085                                                                     UVM_PROT_READ_WRITE_ATOMIC;
12086         }
12087 
12088         // UVM-Lite processors must always have RWA mappings
12089         if (uvm_processor_mask_andnot(revoke_processors, revoke_processors, block_get_uvm_lite_gpus(va_block))) {
12090             // Access counters should never trigger revocations apart from
12091             // read-duplication, which are performed in the calls to
12092             // uvm_va_block_make_resident_read_duplicate, above.
12093             if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
12094                 UVM_ASSERT(check_access_counters_dont_revoke(va_block,
12095                                                              service_context->block_context,
12096                                                              service_context->region,
12097                                                              revoke_processors,
12098                                                              &service_context->revocation_mask,
12099                                                              revoke_prot));
12100             }
12101 
12102             // Downgrade other processors' mappings
12103             status = uvm_va_block_revoke_prot_mask(va_block,
12104                                                    service_context->block_context,
12105                                                    revoke_processors,
12106                                                    service_context->region,
12107                                                    &service_context->revocation_mask,
12108                                                    revoke_prot);
12109             if (status != NV_OK)
12110                 break;
12111         }
12112     }
12113 
12114     uvm_processor_mask_cache_free(revoke_processors);
12115 
12116     if (status != NV_OK)
12117         return status;
12118 
12119     // 3- Map requesting processor with the necessary privileges
12120     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12121         const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask;
12122 
12123         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12124             continue;
12125 
12126         // 3.1 - Unmap CPU pages
12127         // HMM CPU mappings can be upgraded at any time without notification,
12128         // so there is no need to downgrade them first.
12129         if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
12130             UVM_ID_IS_CPU(processor_id) &&
12131             !uvm_va_block_is_hmm(va_block)) {
12132             // The kernel can downgrade managed CPU mappings at any time without
12133             // notifying us, which means our PTE state could be stale. We
12134             // handle this by unmapping the CPU PTE and then re-mapping it.
12135             //
12136             // A CPU fault is unexpected if:
12137             // curr_prot == RW || (!is_write && curr_prot == RO)
12138             status = uvm_va_block_unmap(va_block,
12139                                         service_context->block_context,
12140                                         UVM_ID_CPU,
12141                                         service_context->region,
12142                                         map_prot_mask,
12143                                         NULL);
12144             if (status != NV_OK)
12145                 return status;
12146         }
12147 
12148         // 3.2 - Add new mappings
12149 
12150         // The faulting processor can be mapped remotely due to user policy or
12151         // the thrashing mitigation heuristics. Therefore, we set the cause
12152         // accordingly in each case.
12153 
12154         // Map pages that are thrashing first
12155         if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
12156             uvm_page_mask_t *helper_page_mask = &service_context->block_context->caller_page_mask;
12157             bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
12158                                                         map_prot_mask,
12159                                                         &service_context->thrashing_pin_mask);
12160             if (pages_need_mapping) {
12161                 status = uvm_va_block_map(va_block,
12162                                           service_context->block_context,
12163                                           processor_id,
12164                                           service_context->region,
12165                                           helper_page_mask,
12166                                           new_prot,
12167                                           UvmEventMapRemoteCauseThrashing,
12168                                           &va_block->tracker);
12169                 if (status != NV_OK)
12170                     return status;
12171 
12172                 // Remove thrashing pages from the map mask
12173                 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask,
12174                                                           map_prot_mask,
12175                                                           &service_context->thrashing_pin_mask);
12176                 if (!pages_need_mapping)
12177                     continue;
12178 
12179                 map_prot_mask = helper_page_mask;
12180             }
12181         }
12182 
12183         status = uvm_va_block_map(va_block,
12184                                   service_context->block_context,
12185                                   processor_id,
12186                                   service_context->region,
12187                                   map_prot_mask,
12188                                   new_prot,
12189                                   UvmEventMapRemoteCausePolicy,
12190                                   &va_block->tracker);
12191         if (status != NV_OK)
12192             return status;
12193     }
12194 
12195     // 4- If pages did migrate, map SetAccessedBy processors, except for
12196     // UVM-Lite
12197     for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12198         bool pages_need_mapping;
12199 
12200         if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12201             continue;
12202 
12203         pages_need_mapping = uvm_page_mask_and(caller_page_mask,
12204                                                new_residency_mask,
12205                                                &service_context->mappings_by_prot[new_prot - 1].page_mask);
12206         if (!pages_need_mapping)
12207             continue;
12208 
12209         // Map pages that are thrashing
12210         if (service_context->thrashing_pin_count > 0) {
12211             uvm_page_index_t page_index;
12212 
12213             for_each_va_block_page_in_region_mask(page_index,
12214                                                   &service_context->thrashing_pin_mask,
12215                                                   service_context->region) {
12216                 uvm_processor_mask_t *map_thrashing_processors = NULL;
12217                 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index);
12218 
12219                 // Only process pages being mapped with this protection level
12220                 if (!uvm_page_mask_test(caller_page_mask, page_index))
12221                     continue;
12222 
12223                 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
12224 
12225                 status = uvm_va_block_add_mappings_after_migration(va_block,
12226                                                                    service_context->block_context,
12227                                                                    new_residency,
12228                                                                    processor_id,
12229                                                                    uvm_va_block_region_for_page(page_index),
12230                                                                    caller_page_mask,
12231                                                                    new_prot,
12232                                                                    map_thrashing_processors);
12233                 if (status != NV_OK)
12234                     return status;
12235             }
12236 
12237             pages_need_mapping = uvm_page_mask_andnot(caller_page_mask,
12238                                                       caller_page_mask,
12239                                                       &service_context->thrashing_pin_mask);
12240             if (!pages_need_mapping)
12241                 continue;
12242         }
12243 
12244         // Map the rest of pages in a single shot
12245         status = uvm_va_block_add_mappings_after_migration(va_block,
12246                                                            service_context->block_context,
12247                                                            new_residency,
12248                                                            processor_id,
12249                                                            service_context->region,
12250                                                            caller_page_mask,
12251                                                            new_prot,
12252                                                            NULL);
12253         if (status != NV_OK)
12254             return status;
12255     }
12256 
12257     return NV_OK;
12258 }
12259 
12260 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
12261                                       uvm_va_block_t *va_block,
12262                                       uvm_va_block_retry_t *block_retry,
12263                                       uvm_service_block_context_t *service_context)
12264 {
12265     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12266     uvm_processor_id_t new_residency;
12267     NV_STATUS status = NV_OK;
12268 
12269     uvm_assert_mutex_locked(&va_block->lock);
12270     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
12271                                                   service_context->block_context->hmm.vma,
12272                                                   service_context->region));
12273 
12274     // GPU fault servicing must be done under the VA space read lock. GPU fault
12275     // servicing is required for RM to make forward progress, and we allow other
12276     // threads to call into RM while holding the VA space lock in read mode. If
12277     // we took the VA space lock in write mode on the GPU fault service path,
12278     // we could deadlock because the thread in RM which holds the VA space lock
12279     // for read wouldn't be able to complete until fault servicing completes.
12280     if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id))
12281         uvm_assert_rwsem_locked(&va_space->lock);
12282     else
12283         uvm_assert_rwsem_locked_read(&va_space->lock);
12284 
12285     uvm_va_block_get_prefetch_hint(va_block,
12286                                    uvm_va_policy_get_region(va_block, service_context->region),
12287                                    service_context);
12288 
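          // Service each processor selected as a migration destination. For HMM
          // blocks the HMM-specific path performs both the copy and the mapping
          // steps; otherwise the copy and finish stages run separately below.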
12289     for_each_id_in_mask(new_residency, &service_context->resident_processors) {
12290         if (uvm_va_block_is_hmm(va_block)) {
12291             status = uvm_hmm_va_block_service_locked(processor_id,
12292                                                      new_residency,
12293                                                      va_block,
12294                                                      block_retry,
12295                                                      service_context);
12296             if (status != NV_OK)
12297                 break;
12298 
12299             continue;
12300         }
12301 
12302         status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context);
12303         if (status != NV_OK)
12304             break;
12305 
12306         status = uvm_va_block_service_finish(processor_id, va_block, service_context);
12307         if (status != NV_OK)
12308             break;
12309     }
12310 
12311     return status;
12312 }
12313 
12314 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
12315                                                  uvm_va_block_context_t *va_block_context,
12316                                                  uvm_processor_id_t processor_id,
12317                                                  uvm_page_index_t page_index,
12318                                                  uvm_fault_access_type_t access_type,
12319                                                  bool allow_migration)
12320 {
12321     uvm_va_range_t *va_range = va_block->va_range;
12322     uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type);
12323 
12324     UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
12325                                                   va_block_context->hmm.vma,
12326                                                   uvm_va_block_region_for_page(page_index)));
12327 
12328     // CPU permissions are checked later by block_map_cpu_page.
12329     //
12330     // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because
12331     //       it can also be called from change_pte. Make change_pte call this
12332     //       function and only check CPU permissions here.
12333     if (UVM_ID_IS_GPU(processor_id)) {
12334         if (va_range && uvm_va_range_is_managed_zombie(va_range))
12335             return NV_ERR_INVALID_ADDRESS;
12336 
12337         // GPU faults only check vma permissions if an mm is registered with the
12338         // VA space (i.e., uvm_va_space_mm_retain_lock(va_space) != NULL) or if
12339         // uvm_enable_builtin_tests is set, because the Linux kernel can change
12340         // vm_flags at any moment (for example on mprotect) and here we are not
12341         // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that
12342         // this scenario does not happen.
12343         if (((va_block->hmm.va_space && va_block->hmm.va_space->va_space_mm.mm) || uvm_enable_builtin_tests) &&
12344             (access_prot > compute_logical_prot(va_block, va_block_context->hmm.vma, page_index)))
12345             return NV_ERR_INVALID_ACCESS_TYPE;
12346     }
12347 
12348     // Non-migratable range:
12349     // - CPU accesses are always fatal, regardless of the VA range residency
12350     // - GPU accesses are fatal if the GPU can't map the preferred location
12351     if (!allow_migration) {
12352         UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
12353 
12354         if (UVM_ID_IS_CPU(processor_id)) {
12355             return NV_ERR_INVALID_OPERATION;
12356         }
12357         else {
12358             uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12359 
12360             return uvm_processor_mask_test(
12361                     &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
12362                     processor_id)?
12363                 NV_OK : NV_ERR_INVALID_ACCESS_TYPE;
12364         }
12365     }
12366 
12367     return NV_OK;
12368 }
12369 
12370 // Check if we are faulting on a page with valid permissions to check if we can
12371 // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more
12372 // details
12373 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
12374                                                   uvm_page_index_t page_index,
12375                                                   uvm_fault_access_type_t fault_access_type)
12376 {
12377     // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for
12378     // HMM?
12379     if (uvm_va_block_is_hmm(va_block))
12380         return false;
12381 
12382     if (block_page_is_processor_authorized(va_block,
12383                                            page_index,
12384                                            UVM_ID_CPU,
12385                                            uvm_fault_access_type_to_prot(fault_access_type))) {
12386         NvU64 now = NV_GETTIME();
12387         pid_t pid = current->pid;
12388 
12389         // Latch the pid/timestamp/page_index values for the first time
12390         if (!va_block->cpu.fault_authorized.first_fault_stamp) {
12391             va_block->cpu.fault_authorized.first_fault_stamp = now;
12392             va_block->cpu.fault_authorized.first_pid = pid;
12393             va_block->cpu.fault_authorized.page_index = page_index;
12394 
12395             return true;
12396         }
12397 
12398         // If the same thread shows up again, this means that the kernel
12399         // downgraded the page's PTEs. Service the fault to force a remap of
12400         // the page.
12401         if (va_block->cpu.fault_authorized.first_pid == pid &&
12402             va_block->cpu.fault_authorized.page_index == page_index) {
12403             va_block->cpu.fault_authorized.first_fault_stamp = 0;
12404         }
12405         else {
12406             // If the window has expired, clear the information and service the
12407             // fault. Otherwise, just return.
12408             if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns)
12409                 va_block->cpu.fault_authorized.first_fault_stamp = 0;
12410             else
12411                 return true;
12412         }
12413     }
12414 
12415     return false;
12416 }
12417 
12418 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
12419                                         uvm_va_block_retry_t *va_block_retry,
12420                                         NvU64 fault_addr,
12421                                         uvm_fault_access_type_t fault_access_type,
12422                                         uvm_service_block_context_t *service_context)
12423 {
12424     uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12425     NV_STATUS status = NV_OK;
12426     uvm_page_index_t page_index;
12427     uvm_perf_thrashing_hint_t thrashing_hint;
12428     uvm_processor_id_t new_residency;
12429     bool read_duplicate;
12430     const uvm_va_policy_t *policy;
12431     const bool hmm_migratable = true;
12432 
12433     uvm_assert_rwsem_locked(&va_space->lock);
12434 
12435     UVM_ASSERT(fault_addr >= va_block->start);
12436     UVM_ASSERT(fault_addr <= va_block->end);
12437 
12438     uvm_assert_mmap_lock_locked(service_context->block_context->mm);
12439 
12440     policy = uvm_va_policy_get(va_block, fault_addr);
12441 
12442     if (service_context->num_retries == 0) {
12443         // Notify tools and performance heuristics of the fault event
12444         uvm_perf_event_notify_cpu_fault(&va_space->perf_events,
12445                                         va_block,
12446                                         policy->preferred_location,
12447                                         fault_addr,
12448                                         fault_access_type > UVM_FAULT_ACCESS_TYPE_READ,
12449                                         KSTK_EIP(current));
12450     }
12451 
12452     // Check logical permissions
12453     page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
12454     status = uvm_va_block_check_logical_permissions(va_block,
12455                                                     service_context->block_context,
12456                                                     UVM_ID_CPU,
12457                                                     page_index,
12458                                                     fault_access_type,
12459                                                     uvm_range_group_address_migratable(va_space, fault_addr));
12460     if (status != NV_OK)
12461         return status;
12462 
12463     uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
12464 
12465     if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
12466         return NV_OK;
12467 
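          // Consult the thrashing heuristics for this address; the hint may request
          // throttling the faulting thread or pinning the page.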
12468     thrashing_hint = uvm_perf_thrashing_get_hint(va_block, service_context->block_context, fault_addr, UVM_ID_CPU);
12469     // Throttling is implemented by sleeping in the fault handler on the CPU
12470     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
12471         service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp;
12472         return NV_WARN_MORE_PROCESSING_REQUIRED;
12473     }
12474 
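          // Reset the per-service state. CPU faults are serviced a single page at a
          // time, so the counts below are either 0 or 1.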
12475     service_context->read_duplicate_count = 0;
12476     service_context->thrashing_pin_count = 0;
12477     service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
12478 
12479     if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
12480         uvm_page_mask_zero(&service_context->thrashing_pin_mask);
12481         uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
12482         service_context->thrashing_pin_count = 1;
12483     }
12484 
12485     // Compute the new residency for the faulting page
12486     new_residency = uvm_va_block_select_residency(va_block,
12487                                                   service_context->block_context,
12488                                                   page_index,
12489                                                   UVM_ID_CPU,
12490                                                   uvm_fault_access_type_mask_bit(fault_access_type),
12491                                                   policy,
12492                                                   &thrashing_hint,
12493                                                   UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
12494                                                   hmm_migratable,
12495                                                   &read_duplicate);
12496 
12497     // Initialize the minimum necessary state in the fault service context
12498     uvm_processor_mask_zero(&service_context->resident_processors);
12499 
12500     // Set new residency and update the masks
12501     uvm_processor_mask_set(&service_context->resident_processors, new_residency);
12502 
12503     // The masks need to be fully zeroed as the fault region may grow due to prefetching
12504     uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
12505     uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
12506 
12507     if (read_duplicate) {
12508         uvm_page_mask_zero(&service_context->read_duplicate_mask);
12509         uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
12510         service_context->read_duplicate_count = 1;
12511     }
12512 
12513     service_context->access_type[page_index] = fault_access_type;
12514 
12515     service_context->region = uvm_va_block_region_for_page(page_index);
12516 
12517     status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context);
12518     UVM_ASSERT(status != NV_WARN_MISMATCHED_TARGET);
12519 
12520     ++service_context->num_retries;
12521 
12522     return status;
12523 }
12524 
12525 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
12526                                  NvU64 fault_addr,
12527                                  bool is_write,
12528                                  uvm_service_block_context_t *service_context)
12529 {
12530     NV_STATUS status;
12531     uvm_va_block_retry_t va_block_retry;
12532     uvm_fault_access_type_t fault_access_type;
12533 
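          // A CPU fault only reports whether the access was a write, not whether it
          // was an atomic, so write faults request the strongest access type.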
12534     if (is_write)
12535         fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
12536     else
12537         fault_access_type = UVM_FAULT_ACCESS_TYPE_READ;
12538 
12539     service_context->num_retries = 0;
12540     service_context->cpu_fault.did_migrate = false;
12541 
12542     // We have to use vm_insert_page instead of handing the page to the kernel
12543     // and letting it insert the mapping, and we must do that while holding the
12544     // lock on this VA block. Otherwise there will be a window in which we think
12545     // we've mapped the page but the CPU mapping hasn't actually been created
12546     // yet. During that window a GPU fault event could arrive and claim
12547     // ownership of that VA, "unmapping" it. Then later the kernel would
12548     // eventually establish the mapping, and we'd end up with both CPU and GPU
12549     // thinking they each owned the page.
12550     //
12551     // This function must only be called when it's safe to call vm_insert_page.
12552     // That is, there must be a reference held on the vma's vm_mm, and
12553     // vm_mm->mmap_lock is held in at least read mode. Note that current->mm
12554     // might not be vma->vm_mm.
12555     status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
12556                                      &va_block_retry,
12557                                      block_cpu_fault_locked(va_block,
12558                                                             &va_block_retry,
12559                                                             fault_addr,
12560                                                             fault_access_type,
12561                                                             service_context));
12562     return status;
12563 }
12564 
12565 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block)
12566 {
12567     uvm_va_range_t *va_range;
12568     uvm_va_block_t *block;
12569     size_t index;
12570 
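          // Managed allocations are tracked by VA ranges. An address not covered by
          // any range may still belong to an HMM va_block, so fall back to the HMM
          // lookup in that case.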
12571     va_range = uvm_va_range_find(va_space, addr);
12572     if (!va_range)
12573         return uvm_hmm_va_block_find(va_space, addr, out_block);
12574 
12575     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
12576                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
12577 
12578     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
12579         return NV_ERR_INVALID_ADDRESS;
12580 
12581     index = uvm_va_range_block_index(va_range, addr);
12582     block = uvm_va_range_block(va_range, index);
12583     if (!block)
12584         return NV_ERR_OBJECT_NOT_FOUND;
12585 
12586     *out_block = block;
12587     return NV_OK;
12588 }
12589 
12590 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
12591                                             uvm_va_range_t *va_range,
12592                                             NvU64 addr,
12593                                             uvm_va_block_t **out_block)
12594 {
12595     size_t index;
12596 
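          // Built-in-tests fault injection: fail block creation when the allocation
          // countdown configured by the test reaches zero.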
12597     if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0)
12598         return NV_ERR_NO_MEMORY;
12599 
12600     UVM_ASSERT(va_range);
12601     UVM_ASSERT(addr >= va_range->node.start);
12602     UVM_ASSERT(addr <= va_range->node.end);
12603 
12604     UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
12605                uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
12606 
12607     if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
12608         return NV_ERR_INVALID_ADDRESS;
12609 
12610     index = uvm_va_range_block_index(va_range, addr);
12611     return uvm_va_range_block_create(va_range, index, out_block);
12612 }
12613 
12614 NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
12615                                    NvU64 addr,
12616                                    uvm_va_block_t **out_block)
12617 {
12618     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
12619 
12620     if (va_range)
12621         return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
12622     else
12623         return NV_ERR_INVALID_ADDRESS;
12624 }
12625 
12626 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
12627                                    NvU64 addr,
12628                                    struct vm_area_struct **hmm_vma,
12629                                    uvm_va_block_t **out_block)
12630 {
12631     uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
12632 
12633     if (hmm_vma)
12634         *hmm_vma = NULL;
12635 
12636     if (va_range)
12637         return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
12638     else
12639         return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
12640 }
12641 
12642 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
12643                                            uvm_gpu_t *gpu,
12644                                            uvm_gpu_address_t dst_gpu_address,
12645                                            NvU64 dst,
12646                                            uvm_mem_t *src_mem,
12647                                            size_t size)
12648 {
12649     NV_STATUS status;
12650     uvm_push_t push;
12651     uvm_gpu_address_t src_gpu_address;
12652 
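          // With Confidential Computing enabled the data must be encrypted in
          // transit, so use the encrypted-copy helper instead of a plain CE memcopy.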
12653     if (g_uvm_global.conf_computing_enabled) {
12654         return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
12655                                                           dst_gpu_address,
12656                                                           uvm_mem_get_cpu_addr_kernel(src_mem),
12657                                                           size,
12658                                                           &va_block->tracker,
12659                                                           "Encrypted write to [0x%llx, 0x%llx)",
12660                                                           dst,
12661                                                           dst + size);
12662     }
12663 
12664     status = uvm_push_begin_acquire(gpu->channel_manager,
12665                                     UVM_CHANNEL_TYPE_CPU_TO_GPU,
12666                                     &va_block->tracker,
12667                                     &push,
12668                                     "Direct write to [0x%llx, 0x%llx)",
12669                                     dst,
12670                                     dst + size);
12671     if (status != NV_OK)
12672         return status;
12673 
12674     src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
12675     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
12676     return uvm_push_end_and_wait(&push);
12677 }
12678 
12679 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
12680                                       uvm_va_block_context_t *block_context,
12681                                       NvU64 dst,
12682                                       uvm_mem_t *src_mem,
12683                                       size_t size)
12684 {
12685     NV_STATUS status;
12686     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst);
12687     NvU64 page_offset = dst & (PAGE_SIZE - 1);
12688     uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, block_context, page_index, UVM_ID_CPU);
12689     uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index);
12690 
12691     uvm_assert_mutex_locked(&va_block->lock);
12692     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size);
12693 
12694     if (UVM_ID_IS_INVALID(proc))
12695         proc = UVM_ID_CPU;
12696 
12697     // Use make_resident() in all cases to break read-duplication. block_retry
12698     // can be NULL because, if the page is not resident yet, we will make it
12699     // resident on the CPU.
12700     // Notably we don't care about coherence with respect to atomics from other
12701     // processors.
12702     status = uvm_va_block_make_resident(va_block,
12703                                         NULL,
12704                                         block_context,
12705                                         proc,
12706                                         region,
12707                                         NULL,
12708                                         NULL,
12709                                         UVM_MAKE_RESIDENT_CAUSE_API_TOOLS);
12710 
12711     if (status != NV_OK)
12712         return status;
12713 
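          // If the destination copy is resident on the CPU, write it through a kernel
          // mapping of the page. Otherwise, copy through the GPU that holds the data.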
12714     if (UVM_ID_IS_CPU(proc)) {
12715         char *mapped_page;
12716         struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
12717         void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
12718 
12719         status = uvm_tracker_wait(&va_block->tracker);
12720         if (status != NV_OK)
12721             return status;
12722 
12723         mapped_page = (char *)kmap(page);
12724         memcpy(mapped_page + page_offset, src, size);
12725         kunmap(page);
12726 
12727         return NV_OK;
12728     }
12729     else {
12730         uvm_gpu_t *dst_gpu;
12731         uvm_gpu_address_t dst_gpu_address;
12732 
12733         UVM_ASSERT(UVM_ID_IS_GPU(proc));
12734 
12735         dst_gpu = block_get_gpu(va_block, proc);
12736 
12737         dst_gpu_address = block_phys_page_copy_address(va_block,
12738                                                        block_phys_page(proc, NUMA_NO_NODE, page_index),
12739                                                        dst_gpu);
12740         dst_gpu_address.address += page_offset;
12741 
12742         return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
12743     }
12744 }
12745 
12746 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
12747                                           uvm_mem_t *dst_mem,
12748                                           uvm_gpu_t *gpu,
12749                                           uvm_gpu_address_t src_gpu_address,
12750                                           NvU64 src,
12751                                           size_t size)
12752 {
12753     NV_STATUS status;
12754     uvm_push_t push;
12755     uvm_gpu_address_t dst_gpu_address;
12756 
12757     if (g_uvm_global.conf_computing_enabled) {
12758         return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
12759                                                           uvm_mem_get_cpu_addr_kernel(dst_mem),
12760                                                           src_gpu_address,
12761                                                           size,
12762                                                           &va_block->tracker,
12763                                                           "Encrypted read from [0x%llx, 0x%llx)",
12764                                                           src,
12765                                                           src + size);
12766     }
12767 
12768     status = uvm_push_begin_acquire(gpu->channel_manager,
12769                                     UVM_CHANNEL_TYPE_GPU_TO_CPU,
12770                                     &va_block->tracker,
12771                                     &push,
12772                                     "Direct read from [0x%llx, 0x%llx)",
12773                                     src,
12774                                     src + size);
12775     if (status != NV_OK)
12776         return status;
12777 
12778     dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
12779     gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
12780     return uvm_push_end_and_wait(&push);
12781 }
12782 
12783 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block,
12784                                    uvm_va_block_context_t *va_block_context,
12785                                    uvm_mem_t *dst_mem,
12786                                    NvU64 src,
12787                                    size_t size)
12788 {
12789     uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src);
12790     NvU64 page_offset = src & (PAGE_SIZE - 1);
12791     uvm_processor_id_t proc;
12792     void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem);
12793 
12794     uvm_assert_mutex_locked(&va_block->lock);
12795     UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size);
12796 
12797     proc = uvm_va_block_page_get_closest_resident(va_block, va_block_context, page_index, UVM_ID_CPU);
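          // If the page is not resident anywhere, there is no data to read: return
          // zeros.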
12798     if (UVM_ID_IS_INVALID(proc)) {
12799         memset(dst, 0, size);
12800         return NV_OK;
12801     }
12802     else if (UVM_ID_IS_CPU(proc)) {
12803         NV_STATUS status;
12804         char *mapped_page;
12805         struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
12806 
12807         status = uvm_tracker_wait(&va_block->tracker);
12808         if (status != NV_OK)
12809             return status;
12810 
12811         mapped_page = (char *)kmap(page);
12812         memcpy(dst, mapped_page + page_offset, size);
12813         kunmap(page);
12814 
12815         return NV_OK;
12816     }
12817     else {
12818         uvm_gpu_address_t src_gpu_address;
12819         uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
12820 
12821         src_gpu_address = block_phys_page_copy_address(va_block,
12822                                                        block_phys_page(proc, NUMA_NO_NODE, page_index),
12823                                                        gpu);
12824         src_gpu_address.address += page_offset;
12825 
12826         return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
12827     }
12828 }
12829 
12830 // Deferred work item that reestablishes accessed-by mappings after eviction. On
12831 // GPUs with access counters enabled, the evicted GPU will also get remote
12832 // mappings.
12833 static void block_add_eviction_mappings(void *args)
12834 {
12835     uvm_va_block_t *va_block = (uvm_va_block_t*)args;
12836     uvm_va_space_t *va_space;
12837     uvm_processor_id_t id;
12838     uvm_va_block_context_t *block_context = NULL;
12839     struct mm_struct *mm = NULL;
12840 
12841     uvm_mutex_lock(&va_block->lock);
12842     va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
12843     uvm_mutex_unlock(&va_block->lock);
12844 
12845     if (!va_space) {
12846         // Block has been killed in the meantime
12847         goto done;
12848     }
12849 
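          // uvm_va_space_mm_retain_lock() may return NULL; the allocation and unlock
          // paths below handle a NULL mm.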
12850     mm = uvm_va_space_mm_retain_lock(va_space);
12851 
12852     block_context = uvm_va_block_context_alloc(mm);
12853     if (!block_context)
12854         goto done;
12855 
12856     // The block wasn't dead when we checked above and that's enough to
12857     // guarantee that the VA space is still around, because
12858     // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that
12859     // flush waits for this function call to finish.
12860     uvm_va_space_down_read(va_space);
12861 
12862     // Now that we have the VA space lock held, we can check whether the block
12863     // is still alive since the VA space write lock is needed to kill blocks.
12864     if (uvm_va_block_is_dead(va_block))
12865         goto unlock;
12866 
12867     if (uvm_va_block_is_hmm(va_block)) {
12868         uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context);
12869     }
12870     else {
12871         uvm_va_range_t *va_range = va_block->va_range;
12872         NV_STATUS status = NV_OK;
12873 
12874         for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) {
12875             status = uvm_va_block_set_accessed_by(va_block, block_context, id);
12876             if (status != NV_OK)
12877                 break;
12878         }
12879 
12880         if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) {
12881             uvm_processor_mask_t *map_processors = &block_context->map_processors_eviction;
12882 
12883             // Exclude the processors that have been already mapped due to
12884             // AccessedBy
12885             uvm_processor_mask_andnot(map_processors,
12886                                       &va_block->evicted_gpus,
12887                                       &uvm_va_range_get_policy(va_range)->accessed_by);
12888 
12889             for_each_gpu_id_in_mask(id, map_processors) {
12890                 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12891                 uvm_va_block_gpu_state_t *gpu_state;
12892 
12893                 if (!gpu->parent->access_counters_supported)
12894                     continue;
12895 
12896                 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
12897                 UVM_ASSERT(gpu_state);
12898 
12899                 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
12900                 // remote mappings to read-duplicated pages. Add support for it
12901                 // or create a new function.
12902                 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
12903                                                  uvm_va_block_add_mappings(va_block,
12904                                                                            block_context,
12905                                                                            id,
12906                                                                            uvm_va_block_region_from_block(va_block),
12907                                                                            &gpu_state->evicted,
12908                                                                            UvmEventMapRemoteCauseEviction));
12909                 if (status != NV_OK)
12910                     break;
12911             }
12912         }
12913 
12914         if (status != NV_OK) {
12915             UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n",
12916                           va_block->start,
12917                           va_block->end,
12918                           nvstatusToString(status),
12919                           uvm_va_space_processor_name(va_space, id));
12920         }
12921     }
12922 
12923 unlock:
12924     uvm_va_space_up_read(va_space);
12925     uvm_va_block_context_free(block_context);
12926 
12927 done:
12928     uvm_va_space_mm_release_unlock(va_space, mm);
12929     uvm_va_block_release(va_block);
12930 }
12931 
12932 static void block_add_eviction_mappings_entry(void *args)
12933 {
12934     UVM_ENTRY_VOID(block_add_eviction_mappings(args));
12935 }
12936 
12937 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
12938                                     uvm_gpu_t *gpu,
12939                                     uvm_gpu_chunk_t *root_chunk,
12940                                     uvm_tracker_t *tracker)
12941 {
12942     NV_STATUS status = NV_OK;
12943     NvU32 i;
12944     uvm_va_block_gpu_state_t *gpu_state;
12945     uvm_va_block_region_t chunk_region;
12946     size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu);
12947     size_t chunks_to_evict = 0;
12948     uvm_service_block_context_t *service_context;
12949     uvm_va_block_context_t *block_context;
12950     uvm_page_mask_t *pages_to_evict;
12951     uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
12952     uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
12953     struct mm_struct *mm;
12954     bool accessed_by_set = false;
12955 
12956     uvm_assert_mutex_locked(&va_block->lock);
12957 
12958     // The block might have been killed in the meantime
12959     if (!va_space)
12960         return NV_OK;
12961 
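          // If this GPU has no state allocated for the block, it owns no memory here
          // and there is nothing to evict.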
12962     gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
12963     if (!gpu_state)
12964         return NV_OK;
12965 
12966     if (va_block_test && va_block_test->inject_eviction_error) {
12967         va_block_test->inject_eviction_error = false;
12968         return NV_ERR_NO_MEMORY;
12969     }
12970 
12971     // We cannot take this block's VA space or mmap_lock locks on the eviction
12972     // path. However, we retain the mm in order to support accounting of CPU
12973     // memory allocations. If mappings need to be created,
12974     // block_add_eviction_mappings() will be scheduled below.
12975     mm = uvm_va_space_mm_retain(va_space);
12976 
12977     service_context = uvm_service_block_context_alloc(mm);
12978     if (!service_context) {
12979         if (mm)
12980             uvm_va_space_mm_release(va_space);
12981 
12982         return NV_ERR_NO_MEMORY;
12983     }
12984 
12985     block_context = service_context->block_context;
12986 
12987     pages_to_evict = &block_context->caller_page_mask;
12988     uvm_page_mask_zero(pages_to_evict);
12989     chunk_region.outer = 0;
12990 
12991     // Find all chunks that are subchunks of the root chunk
12992     for (i = 0; i < num_gpu_chunks; ++i) {
12993         uvm_chunk_size_t chunk_size;
12994         size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size);
12995         UVM_ASSERT(chunk_index == i);
12996         chunk_region.first = chunk_region.outer;
12997         chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE;
12998 
12999         if (!gpu_state->chunks[i])
13000             continue;
13001         if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk))
13002             continue;
13003 
13004         if (uvm_va_block_is_hmm(va_block)) {
13005             status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region);
13006             if (status != NV_OK)
13007                 break;
13008         }
13009 
13010         uvm_page_mask_region_fill(pages_to_evict, chunk_region);
13011         ++chunks_to_evict;
13012     }
13013 
13014     if (chunks_to_evict == 0)
13015         goto out;
13016 
13017     // Only move pages resident on the GPU
13018     uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
13019     uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
13020 
13021     if (uvm_va_block_is_hmm(va_block)) {
13022         status = uvm_hmm_va_block_evict_chunks(va_block,
13023                                                service_context,
13024                                                pages_to_evict,
13025                                                uvm_va_block_region_from_block(va_block),
13026                                                &accessed_by_set);
13027     }
13028     else {
13029         const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
13030         accessed_by_set = uvm_processor_mask_get_count(&policy->accessed_by) > 0;
13031 
13032         // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's
13033         // not necessary to do so for eviction. Add a version that unmaps only
13034         // the processors that have mappings to the pages being evicted.
13035         status = uvm_va_block_make_resident(va_block,
13036                                             NULL,
13037                                             block_context,
13038                                             UVM_ID_CPU,
13039                                             uvm_va_block_region_from_block(va_block),
13040                                             pages_to_evict,
13041                                             NULL,
13042                                             UVM_MAKE_RESIDENT_CAUSE_EVICTION);
13043     }
13044     if (status != NV_OK)
13045         goto out;
13046 
13047     // VA space lock may not be held and hence we cannot reestablish any
13048     // mappings here and need to defer it to a work queue.
13049     //
13050     // Reading the accessed_by mask without the VA space lock is safe because
13051     // adding a new processor to the mask triggers going over all the VA blocks
13052     // in the range and locking them. And we hold one of the VA block's locks.
13053     //
13054     // If uvm_va_range_set_accessed_by() hasn't called
13055     // uvm_va_block_set_accessed_by() for this block yet then it will take care
13056     // of adding the mapping after we are done. If it already did then we are
13057     // guaranteed to see the new processor in the accessed_by mask because we
13058     // locked the block's lock that the thread calling
13059     // uvm_va_range_set_accessed_by() unlocked after updating the mask.
13060     //
13061     // If a processor gets removed from the mask then we might not notice and
13062     // schedule the work item anyway, but that's benign as
13063     // block_add_eviction_mappings() re-examines the mask.
13064     //
13065     // Checking whether access counter migrations are enabled on a VA space is
13066     // racy without holding the VA space lock. However, this is fine because
13067     // block_add_eviction_mappings() re-examines the value with the VA space
13068     // lock held.
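    //
    // Illustrative ordering (the argument above restated as a sequence; the
    // code below already holds va_block->lock):
    //
    //   uvm_va_range_set_accessed_by():
    //       update the accessed_by mask
    //       for each block in the range:
    //           lock va_block->lock
    //           uvm_va_block_set_accessed_by()
    //           unlock va_block->lock
    //
    // If the per-block step already ran for this block, the lock hand-off
    // guarantees we observe the updated mask; otherwise it runs after we drop
    // the lock and adds the mapping itself.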
13069     if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) {
13070         // Always retain the VA block first so that it's safe for the deferred
13071         // callback to release it immediately after it runs.
13072         uvm_va_block_retain(va_block);
13073 
13074         if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q,
13075                                           &va_block->eviction_mappings_q_item)) {
13076             // And release it if no new callback was scheduled
13077             uvm_va_block_release_no_destroy(va_block);
13078         }
13079     }
13080 
13081     status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker);
13082     if (status != NV_OK)
13083         goto out;
13084 
13085     for (i = 0; i < num_gpu_chunks; ++i) {
13086         uvm_gpu_id_t accessing_gpu_id;
13087         uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
13088 
13089         if (!chunk)
13090             continue;
13091         if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
13092             continue;
13093 
13094         // Remove the mappings of indirect peers from the reverse map. We
13095         // access the indirect peer mask from the VA space without holding the
13096         // VA space lock. Therefore, we can race with enable_peer/disable_peer
13097         // operations. However, this is fine:
13098         //
13099         // The enable_peer sequence is as follows:
13100         //
13101         // set_bit in va_space->indirect_peers
13102         // uvm_va_block_enable_peer;
13103         //
13104         // - If we read the mask BEFORE it is set or AFTER the mapping has
13105         // been added to the map, there is no race.
13106         // - If we read the mask AFTER it is set but BEFORE adding the mapping
13107         // to the reverse map, we will try to remove it although it is not
13108         // there yet. Therefore, we use
13109         // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
13110         // not check if the mapping is present in the reverse map.
13111         //
13112         // The disable_peer sequence is as follows:
13113         //
13114         // uvm_va_block_disable_peer;
13115         // clear_bit in va_space->indirect_peers
13116         //
13117         // - If we read the mask BEFORE the mapping has been added to the map
13118         // or AFTER the bit has been cleared, there is no race.
13119         // - If we read the mask AFTER the mapping has been removed and BEFORE
13120         // the bit is cleared, we will try to remove the mapping, too.
13121         // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works
13122         // in this scenario.
13123         // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
13124         // peers are not supported when SMC is enabled.
13125         for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
13126             uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
13127             NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
13128 
13129             uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
13130                                                                    peer_addr);
13131         }
13132 
13133         uvm_mmu_chunk_unmap(chunk, tracker);
13134 
13135         uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, chunk);
13136         gpu_state->chunks[i] = NULL;
13137     }
13138 
13139 out:
13140     uvm_service_block_context_free(service_context);
13141 
13142     if (mm)
13143         uvm_va_space_mm_release(va_space);
13144 
13145     return status;
13146 }
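
#if 0
// Illustrative sketch, kept out of the build: the page-mask pattern used by
// the eviction path above. A caller-owned mask is cleared, filled for a chunk
// region, and then intersected with the block's resident mask so that only
// pages actually resident on the GPU get migrated. The helper itself
// (sketch_build_eviction_mask) is hypothetical; the calls it makes are the
// ones used above.
static void sketch_build_eviction_mask(uvm_va_block_t *va_block,
                                       uvm_gpu_t *gpu,
                                       uvm_page_mask_t *mask,
                                       uvm_va_block_region_t chunk_region)
{
    // Start from an empty mask and mark the pages covered by the chunk...
    uvm_page_mask_zero(mask);
    uvm_page_mask_region_fill(mask, chunk_region);

    // ...then keep only the pages currently resident on this GPU.
    uvm_page_mask_and(mask, mask, uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
}
#endif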
13147 
13148 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
13149 {
13150     uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
13151     uvm_push_t push;
13152     NV_STATUS status;
13153 
13154     // See comment in uvm_va_block_set_cancel
13155     UVM_ASSERT(!gpu->parent->fault_cancel_va_supported);
13156 
13157     if (!gpu_state)
13158         return NV_ERR_NO_MEMORY;
13159 
13160     // Force all pages to be 4K and prevent future upgrades during cancel
13161     gpu_state->force_4k_ptes = true;
13162 
13163     // If we have no page tables we're done. For fault cancel we need to make
13164     // sure that fatal faults are on different 4k PTEs than non-fatal faults,
13165     // and we need to service all non-fatal faults before issuing the cancel. So
13166     // either all faults are fatal and we have no PTEs (we're PROT_NONE), or
13167     // we'll allocate PTEs later when we service the non-fatal faults. Those
13168     // PTEs will be 4k since force_4k_ptes is set.
13169     if (!block_gpu_has_page_tables(block, gpu))
13170         return NV_OK;
13171 
13172     // Are we 4k already?
13173     if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
13174         return NV_OK;
13175 
13176     status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL);
13177     if (status != NV_OK)
13178         return status;
13179 
13180     status = uvm_push_begin_acquire(gpu->channel_manager,
13181                                     UVM_CHANNEL_TYPE_MEMOPS,
13182                                     &block->tracker,
13183                                     &push,
13184                                     "Forcing 4k PTEs on block [0x%llx, 0x%llx)",
13185                                     block->start,
13186                                     block->end + 1);
13187     if (status != NV_OK)
13188         return status;
13189 
13190     if (gpu_state->pte_is_2m)
13191         block_gpu_split_2m(block, block_context, gpu, NULL, &push);
13192     else
13193         block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push);
13194 
13195     uvm_push_end(&push);
13196 
13197     UVM_ASSERT(block_check_mappings(block, block_context));
13198 
13199     return uvm_tracker_add_push_safe(&block->tracker, &push);
13200 }
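
#if 0
// Illustrative sketch, kept out of the build: the push lifecycle used by
// block_gpu_force_4k_ptes() above. A push is begun acquiring the block's
// tracker, GPU work is enqueued on it, the push is ended, and then it is added
// back to the block's tracker so later operations order against it. The
// function name (sketch_push_lifecycle) and the empty body between begin and
// end are hypothetical.
static NV_STATUS sketch_push_lifecycle(uvm_va_block_t *block, uvm_gpu_t *gpu)
{
    uvm_push_t push;
    NV_STATUS status;

    status = uvm_push_begin_acquire(gpu->channel_manager,
                                    UVM_CHANNEL_TYPE_MEMOPS,
                                    &block->tracker,
                                    &push,
                                    "Sketch push on block [0x%llx, 0x%llx)",
                                    block->start,
                                    block->end + 1);
    if (status != NV_OK)
        return status;

    // The actual GPU work (e.g. PTE updates) would be pushed here.

    uvm_push_end(&push);

    return uvm_tracker_add_push_safe(&block->tracker, &push);
}
#endif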
13201 
13202 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
13203 {
13204     uvm_assert_mutex_locked(&va_block->lock);
13205 
13206     // Volta+ devices support a global VA cancel method that does not require
13207     // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it
13208     // could result in 4k PTEs on P9 systems which otherwise would never need
13209     // them.
13210     if (gpu->parent->fault_cancel_va_supported)
13211         return NV_OK;
13212 
13213     return block_gpu_force_4k_ptes(va_block, block_context, gpu);
13214 }
13215 
13216 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
13217 {
13218     uvm_va_space_t *va_space = uvm_va_space_get(filp);
13219     struct mm_struct *mm;
13220     uvm_va_block_t *va_block;
13221     uvm_va_block_test_t *va_block_test;
13222     NV_STATUS status = NV_OK;
13223 
13224     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13225     uvm_va_space_down_read(va_space);
13226 
13227     if (mm)
13228         status = uvm_va_block_find_create(va_space, params->lookup_address, NULL, &va_block);
13229     else
13230         status = uvm_va_block_find_create_managed(va_space, params->lookup_address, &va_block);
13231 
13232     if (status != NV_OK)
13233         goto out;
13234 
13235     va_block_test = uvm_va_block_get_test(va_block);
13236     UVM_ASSERT(va_block_test);
13237 
13238     uvm_mutex_lock(&va_block->lock);
13239 
13240     if (params->page_table_allocation_retry_force_count)
13241         va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;
13242 
13243     if (params->user_pages_allocation_retry_force_count)
13244         va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;
13245 
13246     if (params->cpu_chunk_allocation_size_mask) {
13247         if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
13248             !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
13249             status = NV_ERR_INVALID_ARGUMENT;
13250             goto block_unlock;
13251         }
13252 
13253         va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
13254     }
13255 
13256     if (params->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
13257         va_block_test->cpu_chunk_allocation_target_id = params->cpu_chunk_allocation_target_id;
13258 
13259     if (params->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
13260         va_block_test->cpu_chunk_allocation_actual_id = params->cpu_chunk_allocation_actual_id;
13261 
13262     if (params->eviction_error)
13263         va_block_test->inject_eviction_error = params->eviction_error;
13264 
13265     if (params->cpu_pages_allocation_error_count)
13266         va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
13267 
13268     if (params->populate_error)
13269         va_block_test->inject_populate_error = params->populate_error;
13270 
13271 block_unlock:
13272     uvm_mutex_unlock(&va_block->lock);
13273 
13274 out:
13275     uvm_va_space_up_read(va_space);
13276     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
13277     return status;
13278 }
13279 
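// Conversion tables between the test ioctl's UVM_TEST_PTE_MAPPING values and
// the driver-internal uvm_prot_t values; the test ioctls below use them in
// both directions.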
13280 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
13281 {
13282     [UVM_TEST_PTE_MAPPING_INVALID]           = UVM_PROT_NONE,
13283     [UVM_TEST_PTE_MAPPING_READ_ONLY]         = UVM_PROT_READ_ONLY,
13284     [UVM_TEST_PTE_MAPPING_READ_WRITE]        = UVM_PROT_READ_WRITE,
13285     [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
13286 };
13287 
13288 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
13289 {
13290     [UVM_PROT_NONE]              = UVM_TEST_PTE_MAPPING_INVALID,
13291     [UVM_PROT_READ_ONLY]         = UVM_TEST_PTE_MAPPING_READ_ONLY,
13292     [UVM_PROT_READ_WRITE]        = UVM_TEST_PTE_MAPPING_READ_WRITE,
13293     [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
13294 };
13295 
13296 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
13297 {
13298     uvm_va_space_t *va_space = uvm_va_space_get(filp);
13299     uvm_va_block_t *block;
13300     struct mm_struct *mm;
13301     NV_STATUS status = NV_OK;
13302     uvm_prot_t curr_prot, new_prot;
13303     uvm_gpu_t *gpu = NULL;
13304     uvm_processor_id_t id;
13305     uvm_tracker_t local_tracker;
13306     uvm_va_block_region_t region;
13307     uvm_va_block_context_t *block_context = NULL;
13308 
13309     if (!PAGE_ALIGNED(params->va))
13310         return NV_ERR_INVALID_ADDRESS;
13311 
13312     if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
13313         return NV_ERR_INVALID_ARGUMENT;
13314 
13315     new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
13316 
13317     // mmap_lock isn't needed for invalidating CPU mappings, but it will be
13318     // needed for inserting them.
13319     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13320     uvm_va_space_down_read(va_space);
13321 
13322     if (uvm_uuid_is_cpu(&params->uuid)) {
13323         id = UVM_ID_CPU;
13324     }
13325     else {
13326         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
13327         if (!gpu) {
13328             status = NV_ERR_INVALID_DEVICE;
13329             goto out;
13330         }
13331 
13332         // Check if the GPU can access the VA
13333         if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
13334             status = NV_ERR_OUT_OF_RANGE;
13335             goto out;
13336         }
13337 
13338         id = gpu->id;
13339     }
13340 
13341     block_context = uvm_va_block_context_alloc(mm);
13342     if (!block_context) {
13343         status = NV_ERR_NO_MEMORY;
13344         goto out;
13345     }
13346 
13347     if (mm)
13348         status = uvm_va_block_find_create(va_space, params->va, &block_context->hmm.vma, &block);
13349     else
13350         status = uvm_va_block_find_create_managed(va_space, params->va, &block);
13351 
13352     if (status != NV_OK)
13353         goto out;
13354 
13355     // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
13356     if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
13357         goto out;
13358 
13359     uvm_mutex_lock(&block->lock);
13360 
13361     region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
13362     curr_prot = block_page_prot(block, id, region.first);
13363 
13364     if (new_prot == curr_prot) {
13365         status = NV_OK;
13366         goto out_block;
13367     }
13368 
13369     // TODO: Bug 1766124: Upgrades might require revoking other processors'
13370     //       access privileges. We just fail for now. Only downgrades are
13371     //       supported. If we allowed upgrades, we would need to check the mm
13372     //       like we do for revocation below.
13373     if (new_prot > curr_prot) {
13374         status = NV_ERR_INVALID_OPERATION;
13375         goto out_block;
13376     }
13377 
13378     if (new_prot == UVM_PROT_NONE) {
13379         status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
13380     }
13381     else {
13382         UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
13383 
13384         // Revoking CPU mappings performs a combination of unmap + map. The map
13385         // portion requires a valid mm.
13386         if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
13387             status = NV_ERR_INVALID_STATE;
13388         }
13389         else {
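            // Downgrade by revoking every protection level above new_prot: the
            // lowest level revoked is new_prot + 1, which leaves new_prot in
            // place.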
13390             status = uvm_va_block_revoke_prot(block,
13391                                               block_context,
13392                                               id,
13393                                               region,
13394                                               NULL,
13395                                               new_prot + 1,
13396                                               &block->tracker);
13397         }
13398     }
13399 
13400 out_block:
13401     if (status == NV_OK)
13402         status = uvm_tracker_init_from(&local_tracker, &block->tracker);
13403 
13404     uvm_mutex_unlock(&block->lock);
13405 
13406     if (status == NV_OK)
13407         status = uvm_tracker_wait_deinit(&local_tracker);
13408 
13409 out:
13410     uvm_va_space_up_read(va_space);
13411     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
13412 
13413     uvm_va_block_context_free(block_context);
13414 
13415     return status;
13416 }
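
#if 0
// Illustrative sketch, kept out of the build: the tracker snapshot pattern
// used by uvm_test_change_pte_mapping() above. The block's tracker is copied
// into a local tracker under the block lock, and the wait happens after the
// lock is dropped so other threads are not blocked while the GPU work
// completes. The function name (sketch_wait_for_block_work) is hypothetical.
static NV_STATUS sketch_wait_for_block_work(uvm_va_block_t *block)
{
    uvm_tracker_t local_tracker;
    NV_STATUS status;

    uvm_mutex_lock(&block->lock);
    status = uvm_tracker_init_from(&local_tracker, &block->tracker);
    uvm_mutex_unlock(&block->lock);

    if (status != NV_OK)
        return status;

    return uvm_tracker_wait_deinit(&local_tracker);
}
#endif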
13417 
13418 NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
13419 {
13420     uvm_va_space_t *va_space = uvm_va_space_get(filp);
13421     uvm_va_block_t *va_block;
13422     uvm_va_range_t *va_range;
13423     struct mm_struct *mm;
13424     size_t index;
13425     NV_STATUS status = NV_OK;
13426 
13427     BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);
13428 
13429     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13430     uvm_va_space_down_read(va_space);
13431 
13432     va_range = uvm_va_range_find(va_space, params->lookup_address);
13433     if (!va_range) {
13434         status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
13435         if (status == NV_ERR_OBJECT_NOT_FOUND) {
13436             status = uvm_hmm_va_block_range_bounds(va_space,
13437                                                    mm,
13438                                                    params->lookup_address,
13439                                                    &params->va_block_start,
13440                                                    &params->va_block_end,
13441                                                    NULL);
13442             goto out;
13443         }
13444         else if (status != NV_OK) {
13445             goto out;
13446         }
13447     }
13448     else {
13449         index = uvm_va_range_block_index(va_range, params->lookup_address);
13450         va_block = uvm_va_range_block(va_range, index);
13451         if (!va_block) {
13452             status = NV_ERR_OBJECT_NOT_FOUND;
13453             goto out;
13454         }
13455     }
13456 
13457     params->va_block_start = va_block->start;
13458     params->va_block_end   = va_block->end;
13459 
13460 out:
13461     uvm_va_space_up_read(va_space);
13462     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
13463     return status;
13464 }
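
#if 0
// Illustrative sketch, kept out of the build: the managed-range block lookup
// open-coded by the two test ioctls around here. The address is mapped to a
// va_range, then to a block index within that range; the returned block may be
// NULL if it has not been created yet. The function name
// (sketch_lookup_managed_block) is hypothetical.
static uvm_va_block_t *sketch_lookup_managed_block(uvm_va_space_t *va_space, NvU64 addr)
{
    uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);

    if (!va_range || va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
        return NULL;

    return uvm_va_range_block(va_range, uvm_va_range_block_index(va_range, addr));
}
#endif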
13465 
13466 NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
13467 {
13468     NV_STATUS status = NV_OK;
13469     uvm_va_space_t *va_space = uvm_va_space_get(filp);
13470     uvm_va_range_t *va_range;
13471     uvm_va_block_t *block = NULL;
13472     uvm_va_block_context_t *block_context = NULL;
13473     struct mm_struct *mm;
13474     NvU32 count = 0;
13475     uvm_processor_mask_t *resident_on_mask;
13476     uvm_processor_id_t id;
13477     uvm_page_index_t page_index;
13478     unsigned release_block_count = 0;
13479     NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
13480     size_t index;
13481 
13482     resident_on_mask = uvm_processor_mask_cache_alloc();
13483     if (!resident_on_mask)
13484         return NV_ERR_NO_MEMORY;
13485 
13486     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13487 
13488     block_context = uvm_va_block_context_alloc(mm);
13489     if (!block_context) {
13490         status = NV_ERR_NO_MEMORY;
13491         goto out_unlocked;
13492     }
13493 
13494     uvm_va_space_down_read(va_space);
13495 
13496     // Inline uvm_va_block_find() to get the va_range.
13497     va_range = uvm_va_range_find(va_space, addr);
13498     if (!va_range) {
13499         NvU64 start, end;
13500 
13501         status = uvm_hmm_va_block_find(va_space, addr, &block);
13502         if (status != NV_OK) {
13503             if (status != NV_ERR_OBJECT_NOT_FOUND)
13504                 goto out;
13505             status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
13506             goto out;
13507         }
13508         // Update current CPU mapping information.
13509         status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
13510         if (status != NV_OK) {
13511             block = NULL;
13512             goto out;
13513         }
13514     }
13515     else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
13516         status = NV_ERR_INVALID_ADDRESS;
13517         goto out;
13518     }
13519     else {
13520         index = uvm_va_range_block_index(va_range, addr);
13521         block = uvm_va_range_block(va_range, index);
13522         if (!block) {
13523             params->resident_on_count = 0;
13524             params->populated_on_count = 0;
13525             params->mapped_on_count = 0;
13526             params->resident_nid = -1;
13527 
13528             status = NV_OK;
13529 
13530             goto out;
13531         }
13532     }
13533 
13534     uvm_mutex_lock(&block->lock);
13535 
13536     page_index = uvm_va_block_cpu_page_index(block, addr);
13537     uvm_va_block_page_resident_processors(block, page_index, resident_on_mask);
13538 
13539     params->resident_nid = -1;
13540     for_each_id_in_mask(id, resident_on_mask) {
13541         block_phys_page_t block_page;
13542         int nid = block_get_page_node_residency(block, page_index);
13543 
13544         block_page = block_phys_page(id, nid, page_index);
13545         uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
13546         params->resident_physical_size[count] = block_phys_page_size(block, block_page);
13547         if (UVM_ID_IS_CPU(id)) {
13548             params->resident_physical_address[count] = page_to_phys(uvm_va_block_get_cpu_page(block, page_index));
13549             params->resident_nid = nid;
13550 
13551             // Check that the page is only resident on a single CPU NUMA node.
13552             for_each_possible_uvm_node(nid) {
13553                 if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index) && nid != params->resident_nid) {
13554                     status = NV_ERR_INVALID_STATE;
13555                     goto out;
13556                 }
13557             }
13558         }
13559         else {
13560             params->resident_physical_address[count] =
13561                 block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
13562         }
13563 
13564         ++count;
13565     }
13566 
13567     params->resident_on_count = count;
13568 
13569     count = 0;
13570     for_each_id_in_mask(id, &block->mapped) {
13571         uvm_processor_id_t processor_to_map;
13572         block_phys_page_t block_page;
13573         NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
13574         int nid = NUMA_NO_NODE;
13575 
13576         if (page_size == 0)
13577             continue;
13578 
13579         uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);
13580 
13581         params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
13582         UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
13583         processor_to_map = block_get_processor_to_map(block, block_context, id, page_index);
13584         if (UVM_ID_IS_CPU(processor_to_map))
13585             nid = block_get_page_node_residency(block, page_index);
13586 
13587         block_page = block_phys_page(processor_to_map, nid, page_index);
13588 
13589         if (!UVM_ID_IS_CPU(id)) {
13590             uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
13591                                                                            block_page,
13592                                                                            uvm_va_space_get_gpu(va_space, id));
13593             params->mapping_physical_address[count] = gpu_phys_addr.address;
13594         }
13595         else {
13596             struct page *page = block_page_get(block, block_page);
13597 
13598             params->mapping_physical_address[count] = page_to_phys(page);
13599         }
13600 
13601         params->page_size[count] = page_size;
13602         ++count;
13603     }
13604 
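    // When the page is resident on exactly one processor, cross-check the
    // reverse (physical-to-virtual) mappings. Each successful reverse
    // translation retains the VA block, and those references are dropped via
    // the release_block_count loop once the block lock has been released.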
13605     if (params->resident_on_count == 1) {
13606         if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
13607             if (uvm_pmm_sysmem_mappings_indirect_supported()) {
13608                 for_each_gpu_id(id) {
13609                     NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
13610                     uvm_reverse_map_t sysmem_page;
13611                     uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
13612                     size_t num_pages;
13613                     uvm_gpu_t *gpu;
13614 
13615                     if (!uvm_va_block_gpu_state_get(block, id))
13616                         continue;
13617 
13618                     gpu = uvm_va_space_get_gpu(va_space, id);
13619 
13620                     if (!gpu->parent->access_counters_can_use_physical_addresses)
13621                         continue;
13622 
13623                     num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
13624                                                                     uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk,
13625                                                                                                            gpu->parent),
13626                                                                     uvm_cpu_chunk_get_size(chunk),
13627                                                                     &sysmem_page,
13628                                                                     1);
13629                     if (page_size > 0)
13630                         UVM_ASSERT(num_pages == 1);
13631                     else
13632                         UVM_ASSERT(num_pages <= 1);
13633 
13634                     if (num_pages == 1) {
13635                         UVM_ASSERT(sysmem_page.va_block == block);
13636                         UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
13637                         UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);
13638 
13639                         ++release_block_count;
13640                     }
13641                 }
13642             }
13643         }
13644         else {
13645             uvm_gpu_id_t id = uvm_processor_mask_find_first_id(resident_on_mask);
13646             uvm_reverse_map_t gpu_mapping;
13647             size_t num_pages;
13648             uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
13649             uvm_gpu_phys_address_t phys_addr;
13650 
13651             phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
13652             num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);
13653 
13654             // Chunk may be in TEMP_PINNED state so it may not have a VA block
13655             // assigned. In that case, we don't get a valid translation.
13656             if (num_pages > 0) {
13657                 UVM_ASSERT(num_pages == 1);
13658                 UVM_ASSERT(gpu_mapping.va_block == block);
13659                 UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);
13660 
13661                 ++release_block_count;
13662             }
13663         }
13664     }
13665 
13666     params->mapped_on_count = count;
13667 
13668     count = 0;
13669     for_each_id(id) {
13670         if (!block_processor_page_is_populated(block, id, page_index))
13671             continue;
13672 
13673         uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
13674         ++count;
13675     }
13676 
13677     params->populated_on_count = count;
13678 
13679 out:
13680     if (block) {
13681         if (!params->is_async && status == NV_OK)
13682             status = uvm_tracker_wait(&block->tracker);
13683         uvm_mutex_unlock(&block->lock);
13684         while (release_block_count--)
13685             uvm_va_block_release(block);
13686     }
13687 
13688     uvm_va_space_up_read(va_space);
13689     uvm_va_block_context_free(block_context);
13690 
13691 out_unlocked:
13692     uvm_va_space_mm_or_current_release_unlock(va_space, mm);
13693     uvm_processor_mask_cache_free(resident_on_mask);
13694 
13695     return status;
13696 }
13697 
13698 void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
13699 {
13700     block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
13701 }
13702