1 /*******************************************************************************
2 Copyright (c) 2015-2023 NVIDIA Corporation
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to
6 deal in the Software without restriction, including without limitation the
7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 sell copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 DEALINGS IN THE SOFTWARE.
21
22 *******************************************************************************/
23
24 #include "uvm_linux.h"
25 #include "uvm_common.h"
26 #include "uvm_api.h"
27 #include "uvm_global.h"
28 #include "uvm_gpu.h"
29 #include "uvm_va_space.h"
30 #include "uvm_va_range.h"
31 #include "uvm_va_block.h"
32 #include "uvm_hal_types.h"
33 #include "uvm_kvmalloc.h"
34 #include "uvm_tools.h"
35 #include "uvm_processors.h"
36 #include "uvm_push.h"
37 #include "uvm_hal.h"
38 #include "uvm_perf_thrashing.h"
39 #include "uvm_perf_prefetch.h"
40 #include "uvm_mem.h"
41 #include "uvm_gpu_access_counters.h"
42 #include "uvm_va_space_mm.h"
43 #include "uvm_test_ioctl.h"
44 #include "uvm_conf_computing.h"
45
46 typedef enum
47 {
48 BLOCK_PTE_OP_MAP,
49 BLOCK_PTE_OP_REVOKE,
50 BLOCK_PTE_OP_COUNT
51 } block_pte_op_t;
52
53 static NvU64 uvm_perf_authorized_cpu_fault_tracking_window_ns = 300000;
54
55 static struct kmem_cache *g_uvm_va_block_cache __read_mostly;
56 static struct kmem_cache *g_uvm_va_block_gpu_state_cache __read_mostly;
57 static struct kmem_cache *g_uvm_page_mask_cache __read_mostly;
58 static struct kmem_cache *g_uvm_va_block_context_cache __read_mostly;
59 static struct kmem_cache *g_uvm_va_block_cpu_node_state_cache __read_mostly;
60
61 static int uvm_fault_force_sysmem __read_mostly = 0;
62 module_param(uvm_fault_force_sysmem, int, S_IRUGO|S_IWUSR);
63 MODULE_PARM_DESC(uvm_fault_force_sysmem, "Force (1) using sysmem storage for pages that faulted. Default: 0.");
64
65 static int uvm_perf_map_remote_on_eviction __read_mostly = 1;
66 module_param(uvm_perf_map_remote_on_eviction, int, S_IRUGO);
67
68 static int uvm_block_cpu_to_cpu_copy_with_ce __read_mostly = 0;
69 module_param(uvm_block_cpu_to_cpu_copy_with_ce, int, S_IRUGO | S_IWUSR);
70 MODULE_PARM_DESC(uvm_block_cpu_to_cpu_copy_with_ce, "Use GPU CEs for CPU-to-CPU migrations.");
71
72 // Caching is always disabled for mappings to remote memory. The following two
73 // module parameters can be used to force caching for GPU peer/sysmem mappings.
74 //
75 // However, enabling caching may not be safe in the general case, so these
76 // parameters should only be used for experiments.
77 static unsigned uvm_exp_gpu_cache_peermem __read_mostly = 0;
78 module_param(uvm_exp_gpu_cache_peermem, uint, S_IRUGO);
79 MODULE_PARM_DESC(uvm_exp_gpu_cache_peermem,
80 "Force caching for mappings to peer memory. "
81 "This is an experimental parameter that may cause correctness issues if used.");
82
83 static unsigned uvm_exp_gpu_cache_sysmem __read_mostly = 0;
84 module_param(uvm_exp_gpu_cache_sysmem, uint, S_IRUGO);
85 MODULE_PARM_DESC(uvm_exp_gpu_cache_sysmem,
86 "Force caching for mappings to system memory. "
87 "This is an experimental parameter that may cause correctness issues if used.");
88
89 static void block_add_eviction_mappings_entry(void *args);
90
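// Return the va_space this block belongs to: the HMM va_space if set, else the
// va_range's va_space, or NULL if the block has no va_range (i.e. it is dead).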
91 uvm_va_space_t *uvm_va_block_get_va_space_maybe_dead(uvm_va_block_t *va_block)
92 {
93 #if UVM_IS_CONFIG_HMM()
94 if (va_block->hmm.va_space)
95 return va_block->hmm.va_space;
96 #endif
97
98 if (va_block->va_range)
99 return va_block->va_range->va_space;
100
101 return NULL;
102 }
103
104 uvm_va_space_t *uvm_va_block_get_va_space(uvm_va_block_t *va_block)
105 {
106 uvm_va_space_t *va_space;
107
108 UVM_ASSERT(!uvm_va_block_is_dead(va_block));
109
110 va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
111 UVM_ASSERT(va_space);
112
113 return va_space;
114 }
115
116 static NvU64 block_gpu_pte_flag_cacheable(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
117 {
118 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
119
120 UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
121
122 // Local vidmem is always cached
123 if (uvm_id_equal(resident_id, gpu->id))
124 return UVM_MMU_PTE_FLAGS_CACHED;
125
126 if (UVM_ID_IS_CPU(resident_id))
127 return uvm_exp_gpu_cache_sysmem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
128
129 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(gpu->id)], resident_id));
130
131 return uvm_exp_gpu_cache_peermem == 0 ? UVM_MMU_PTE_FLAGS_NONE : UVM_MMU_PTE_FLAGS_CACHED;
132 }
133
134 static uvm_gpu_t *block_get_gpu(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
135 {
136 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
137
138 return uvm_va_space_get_gpu(va_space, gpu_id);
139 }
140
141 static const char *block_processor_name(uvm_va_block_t *block, uvm_processor_id_t id)
142 {
143 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
144
145 return uvm_va_space_processor_name(va_space, id);
146 }
147
148 static bool block_processor_has_memory(uvm_va_block_t *block, uvm_processor_id_t id)
149 {
150 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
151
152 return uvm_va_space_processor_has_memory(va_space, id);
153 }
154
155 static bool is_uvm_fault_force_sysmem_set(void)
156 {
157 // Only enforce this during testing
158 return uvm_enable_builtin_tests && uvm_fault_force_sysmem != 0;
159 }
160
161 bool uvm_va_space_map_remote_on_eviction(uvm_va_space_t *va_space)
162 {
163 return uvm_perf_map_remote_on_eviction &&
164 uvm_va_space_has_access_counter_migrations(va_space);
165 }
166
167 static const uvm_processor_mask_t *block_get_uvm_lite_gpus(uvm_va_block_t *va_block)
168 {
169 // Note that for HMM we always return a pointer to a zero bitmap
170 // (not allocated on the stack) since uvm_lite GPUs are not supported.
171 if (uvm_va_block_is_hmm(va_block))
172 return &g_uvm_processor_mask_empty;
173 else
174 return &va_block->va_range->uvm_lite_gpus;
175 }
176
177 void uvm_va_block_retry_init(uvm_va_block_retry_t *retry)
178 {
179 if (!retry)
180 return;
181
182 uvm_tracker_init(&retry->tracker);
183 INIT_LIST_HEAD(&retry->used_chunks);
184 INIT_LIST_HEAD(&retry->free_chunks);
185 }
186
187 static size_t node_to_index(int nid)
188 {
189 UVM_ASSERT(nid != NUMA_NO_NODE);
190 UVM_ASSERT(nid < MAX_NUMNODES);
191 return __nodes_weight(&node_possible_map, nid);
192 }
193
194 static uvm_va_block_cpu_node_state_t *block_node_state_get(uvm_va_block_t *block, int nid)
195 {
196 size_t index = node_to_index(nid);
197 UVM_ASSERT(block->cpu.node_state[index]);
198 return block->cpu.node_state[index];
199 }
200
201 static uvm_page_mask_t *block_tracking_node_mask_get(uvm_va_block_context_t *va_block_context, int nid)
202 {
203 size_t index = node_to_index(nid);
204 UVM_ASSERT(va_block_context->make_resident.cpu_pages_used.node_masks[index]);
205 return va_block_context->make_resident.cpu_pages_used.node_masks[index];
206 }
207
208 // The bottom bit of uvm_va_block_t::chunks is used to indicate how CPU chunks
209 // are stored.
210 //
211 // CPU chunk storage is handled in three different ways depending on the
212 // type of chunks the VA block owns. This is done to minimize the memory
213 // required to hold metadata.
214 typedef enum
215 {
216 // The uvm_va_block_t::chunks pointer points to a single 2MB
217 // CPU chunk.
218 UVM_CPU_CHUNK_STORAGE_CHUNK = 0,
219
220 // The uvm_va_block_t::chunks pointer points to a
221 // structure of mixed (64K and 4K) chunks.
222 UVM_CPU_CHUNK_STORAGE_MIXED,
223 UVM_CPU_CHUNK_STORAGE_COUNT,
224 } uvm_cpu_chunk_storage_type_t;
225
226 #define UVM_CPU_CHUNK_STORAGE_MASK 0x1
227
228 // The maximum number of slots in the mixed chunk mode (64K + 4K chunks) is
229 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK. Any leading/trailing misaligned pages will
230 // be stored in the first/last entry, respectively.
231 #define MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK MAX_BIG_PAGES_PER_UVM_VA_BLOCK
232
233 #define MAX_SMALL_CHUNKS_PER_BIG_SLOT (UVM_MIN_BIG_PAGE_SIZE / PAGE_SIZE)
234
235 // This structure is used when a VA block contains 64K or a mix of 64K and 4K
236 // CPU chunks.
237 // For every 64K CPU chunk, big_chunks will have its corresponding bit set
238 // and the corresponding index in slots will point directly to the
239 // uvm_cpu_chunk_t structure.
240 //
241 // For 4K CPU chunks, the corresponding bit in big_chunks will be clear and
242 // the element in slots will point to an array of 16 uvm_cpu_chunk_t pointers.
243 typedef struct {
244 DECLARE_BITMAP(big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
245 void *slots[MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK];
246 } uvm_cpu_chunk_storage_mixed_t;
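
// Illustrative layout (assuming PAGE_SIZE == 4K and a 64K minimum big page
// size): a fully populated 2MB block has 32 big slots. A slot holding a single
// 64K chunk has its big_chunks bit set and its slots[] entry pointing directly
// at the uvm_cpu_chunk_t; a slot holding 4K chunks has the bit clear and its
// slots[] entry pointing at an array of 16 uvm_cpu_chunk_t pointers, one per
// 4K page.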
247
248 static uvm_va_block_region_t uvm_cpu_chunk_block_region(uvm_va_block_t *va_block,
249 uvm_cpu_chunk_t *chunk,
250 uvm_page_index_t page_index)
251 {
252 UVM_ASSERT(chunk);
253 return uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
254 }
255
256 static void *uvm_cpu_storage_get_ptr(uvm_va_block_cpu_node_state_t *node_state)
257 {
258 return (void *)(node_state->chunks & ~UVM_CPU_CHUNK_STORAGE_MASK);
259 }
260
261 static uvm_cpu_chunk_storage_type_t uvm_cpu_storage_get_type(uvm_va_block_cpu_node_state_t *node_state)
262 {
263 return node_state->chunks & UVM_CPU_CHUNK_STORAGE_MASK;
264 }
265
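// Return the NUMA node on which the given page is resident, or NUMA_NO_NODE if
// the page is not resident on any CPU node.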
266 static int block_get_page_node_residency(uvm_va_block_t *block, uvm_page_index_t page_index)
267 {
268 int nid;
269
270 for_each_possible_uvm_node(nid) {
271 if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index))
272 return nid;
273 }
274
275 return NUMA_NO_NODE;
276 }
277
278 static uvm_page_index_t compute_page_prefix(uvm_va_block_t *va_block, uvm_chunk_size_t size)
279 {
280 return (UVM_ALIGN_UP(va_block->start, size) - va_block->start) / PAGE_SIZE;
281 }
282
283 static size_t compute_slot_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
284 {
285 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(va_block);
286 uvm_page_index_t prefix;
287 size_t slot_index;
288
289 UVM_ASSERT(page_index < block_region.outer);
290 prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
291
292 if (page_index < prefix)
293 return 0;
294
295 slot_index = ((page_index - prefix) / MAX_SMALL_CHUNKS_PER_BIG_SLOT) + !!prefix;
296 UVM_ASSERT(slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
297
298 return slot_index;
299 }
300
301 static size_t compute_small_index(uvm_va_block_t *va_block, uvm_page_index_t page_index)
302 {
303 size_t prefix = compute_page_prefix(va_block, UVM_PAGE_SIZE_64K);
304
305 if (page_index < prefix)
306 return page_index;
307
308 return (page_index - prefix) % MAX_SMALL_CHUNKS_PER_BIG_SLOT;
309 }
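
// Illustrative example (assuming PAGE_SIZE == 4K): if va_block->start is
// 0xC000 bytes past a 64K boundary, compute_page_prefix() returns 4, so page
// indices 0-3 (the misaligned prefix) land in slot 0 with small indices 0-3,
// indices 4-19 land in slot 1, indices 20-35 in slot 2, and so on. Within a
// slot, compute_small_index() is the page's offset from the slot's first page.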
310
311 NV_STATUS uvm_cpu_chunk_insert_in_block(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
312 {
313 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
314 uvm_va_block_region_t chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
315 int nid = uvm_cpu_chunk_get_numa_node(chunk);
316 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
317 size_t slot_index;
318 uvm_cpu_chunk_storage_mixed_t *mixed;
319 uvm_cpu_chunk_t **chunks = NULL;
320
321 // We only want to use the bottom bit of a pointer.
322 BUILD_BUG_ON(UVM_CPU_CHUNK_STORAGE_COUNT > 2);
323
324 // We want to protect against two threads manipulating the VA block's CPU
325 // chunks at the same time. However, when a block is split, the new block's
326 // lock is locked without tracking. So, we can't use
327 // uvm_assert_mutex_locked().
328 UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
329
330 if (chunk_size == UVM_CHUNK_SIZE_2M) {
331 UVM_ASSERT(uvm_va_block_size(va_block) == UVM_PAGE_SIZE_2M);
332 UVM_ASSERT(!node_state->chunks);
333 node_state->chunks = (unsigned long)chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
334 }
335 else {
336 if (!node_state->chunks) {
337 mixed = uvm_kvmalloc_zero(sizeof(*mixed));
338 if (!mixed)
339 return NV_ERR_NO_MEMORY;
340
341 node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
342 }
343
344 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
345 mixed = uvm_cpu_storage_get_ptr(node_state);
346 slot_index = compute_slot_index(va_block, page_index);
347 UVM_ASSERT(compute_slot_index(va_block, page_index + uvm_cpu_chunk_num_pages(chunk) - 1) == slot_index);
348 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
349
350 if (chunk_size == UVM_CHUNK_SIZE_64K) {
351 mixed->slots[slot_index] = chunk;
352 set_bit(slot_index, mixed->big_chunks);
353 }
354 else {
355 size_t small_index;
356
357 UVM_ASSERT(chunk_size == UVM_CHUNK_SIZE_4K);
358 chunks = mixed->slots[slot_index];
359
360 if (!chunks) {
361 chunks = uvm_kvmalloc_zero(sizeof(*chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
362 if (!chunks)
363 return NV_ERR_NO_MEMORY;
364 mixed->slots[slot_index] = chunks;
365 }
366
367 small_index = compute_small_index(va_block, page_index);
368 chunks[small_index] = chunk;
369 }
370 }
371
372 uvm_page_mask_region_fill(&node_state->allocated, chunk_region);
373 uvm_page_mask_region_fill(&va_block->cpu.allocated, chunk_region);
374 return NV_OK;
375 }
376
377 uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
378 {
379 uvm_va_block_cpu_node_state_t *node_state;
380 uvm_cpu_chunk_storage_mixed_t *mixed;
381 uvm_cpu_chunk_t *chunk;
382 uvm_cpu_chunk_t **chunks;
383 size_t slot_index;
384
385 UVM_ASSERT(page_index < uvm_va_block_num_cpu_pages(va_block));
386 UVM_ASSERT(nid != NUMA_NO_NODE);
387 node_state = block_node_state_get(va_block, nid);
388 if (!uvm_page_mask_test(&node_state->allocated, page_index))
389 return NULL;
390
391 UVM_ASSERT(node_state->chunks);
392
393 if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
394 return uvm_cpu_storage_get_ptr(node_state);
395 }
396 else {
397 mixed = uvm_cpu_storage_get_ptr(node_state);
398 slot_index = compute_slot_index(va_block, page_index);
399 UVM_ASSERT(mixed->slots[slot_index] != NULL);
400 if (test_bit(slot_index, mixed->big_chunks))
401 return mixed->slots[slot_index];
402
403 chunks = mixed->slots[slot_index];
404 chunk = chunks[compute_small_index(va_block, page_index)];
405 }
406
407 UVM_ASSERT(chunk);
408 return chunk;
409 }
410
411 uvm_cpu_chunk_t *uvm_cpu_chunk_get_any_chunk_for_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
412 {
413 int nid;
414 uvm_va_block_cpu_node_state_t *node_state;
415
416 // Callers for managed blocks should already know the correct nid and
417 // shouldn't need to call this function.
418 UVM_ASSERT(uvm_va_block_is_hmm(va_block));
419
420 for_each_possible_uvm_node(nid) {
421 node_state = block_node_state_get(va_block, nid);
422 if (uvm_page_mask_test(&node_state->allocated, page_index))
423 return uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
424 }
425
426 return NULL;
427 }
428
429 static uvm_cpu_chunk_t *uvm_cpu_chunk_get_chunk_for_page_resident(uvm_va_block_t *va_block, uvm_page_index_t page_index)
430 {
431 uvm_cpu_chunk_t *chunk = NULL;
432 int nid = block_get_page_node_residency(va_block, page_index);
433
434 if (nid != NUMA_NO_NODE)
435 chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
436
437 return chunk;
438 }
439
440 void uvm_cpu_chunk_remove_from_block(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
441 {
442 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
443 uvm_cpu_chunk_storage_mixed_t *mixed;
444 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
445 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
446 size_t slot_index;
447 uvm_cpu_chunk_t **chunks;
448 int nid_iter;
449
450 // We want to protect against two threads manipulating the VA block's CPU
451 // chunks at the same time. However, when a block is split, the new block's
452 // lock is locked without tracking. So, we can't use
453 // uvm_assert_mutex_locked().
454 UVM_ASSERT(mutex_is_locked(&va_block->lock.m));
455 UVM_ASSERT(node_state->chunks);
456 UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == uvm_cpu_chunk_num_pages(chunk));
457
458 if (uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK) {
459 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
460 UVM_ASSERT(uvm_cpu_storage_get_ptr(node_state) == chunk);
461 node_state->chunks = 0;
462 }
463 else {
464 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) != UVM_CHUNK_SIZE_2M);
465 mixed = uvm_cpu_storage_get_ptr(node_state);
466 slot_index = compute_slot_index(va_block, page_index);
467 UVM_ASSERT(mixed->slots[slot_index] != NULL);
468
469 if (test_bit(slot_index, mixed->big_chunks)) {
470 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
471 UVM_ASSERT(mixed->slots[slot_index] == chunk);
472 mixed->slots[slot_index] = NULL;
473 clear_bit(slot_index, mixed->big_chunks);
474 }
475 else {
476 size_t small_index;
477
478 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K);
479 chunks = mixed->slots[slot_index];
480 small_index = compute_small_index(va_block, page_index);
481 UVM_ASSERT(chunks[small_index] == chunk);
482 chunks[small_index] = NULL;
483
484 for (small_index = 0; small_index < MAX_SMALL_CHUNKS_PER_BIG_SLOT; small_index++) {
485 if (chunks[small_index])
486 break;
487 }
488
489 if (small_index == MAX_SMALL_CHUNKS_PER_BIG_SLOT) {
490 uvm_kvfree(chunks);
491 mixed->slots[slot_index] = NULL;
492 }
493 }
494 }
495
496 uvm_page_mask_region_clear(&node_state->allocated, chunk_region);
497 uvm_page_mask_zero(&va_block->cpu.allocated);
498 for_each_possible_uvm_node(nid_iter) {
499 uvm_va_block_cpu_node_state_t *iter_node_state = block_node_state_get(va_block, nid_iter);
500 uvm_page_mask_or(&va_block->cpu.allocated, &va_block->cpu.allocated, &iter_node_state->allocated);
501 }
502
503 if (uvm_page_mask_empty(&node_state->allocated) && node_state->chunks) {
504 uvm_kvfree(uvm_cpu_storage_get_ptr(node_state));
505 node_state->chunks = 0;
506 }
507 }
508
509 struct page *uvm_cpu_chunk_get_cpu_page(uvm_va_block_t *va_block, uvm_cpu_chunk_t *chunk, uvm_page_index_t page_index)
510 {
511 uvm_va_block_region_t chunk_region;
512
513 UVM_ASSERT(chunk);
514 UVM_ASSERT(chunk->page);
515 chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
516 return chunk->page + (page_index - chunk_region.first);
517 }
518
519 struct page *uvm_va_block_get_cpu_page(uvm_va_block_t *va_block, uvm_page_index_t page_index)
520 {
521 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(va_block, page_index);
522
523 return uvm_cpu_chunk_get_cpu_page(va_block, chunk, page_index);
524 }
525
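// Return the first CPU chunk allocated on NUMA node nid within the given
// region, or NULL if there is none. If a chunk is found and first_chunk_page
// is non-NULL, it is set to the first page index covered by that chunk.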
526 static uvm_cpu_chunk_t *uvm_cpu_chunk_first_in_region(uvm_va_block_t *va_block,
527 uvm_va_block_region_t region,
528 int nid,
529 uvm_page_index_t *first_chunk_page)
530 {
531 uvm_cpu_chunk_t *chunk = NULL;
532 uvm_page_index_t page_index;
533 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
534
535 if (!node_state)
536 return NULL;
537
538 page_index = uvm_va_block_first_page_in_mask(region, &node_state->allocated);
539 if (page_index < region.outer)
540 chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
541
542 if (first_chunk_page && chunk) {
543 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(va_block, chunk, page_index);
544 *first_chunk_page = chunk_region.first;
545 }
546
547 return chunk;
548 }
549
550 static uvm_cpu_chunk_t *uvm_cpu_chunk_next_in_region(uvm_va_block_t *va_block,
551 uvm_va_block_region_t region,
552 int nid,
553 uvm_page_index_t prev_page_index,
554 uvm_page_index_t *next_chunk_page)
555 {
556 if (prev_page_index >= region.outer)
557 return NULL;
558
559 return uvm_cpu_chunk_first_in_region(va_block,
560 uvm_va_block_region(prev_page_index, region.outer),
561 nid, next_chunk_page);
562 }
563
564 #define for_each_cpu_chunk_in_block_region(chunk, chunk_start, va_block, nid, region) \
565 for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start)); \
566 (chunk) != NULL; \
567 (chunk) = uvm_cpu_chunk_next_in_region((va_block), \
568 (region), \
569 (nid), \
570 (chunk_start) + uvm_cpu_chunk_num_pages((chunk)), \
571 &(chunk_start)))
572
573 #define for_each_cpu_chunk_in_block_region_safe(chunk, chunk_start, next_chunk_start, va_block, nid, region) \
574 for ((chunk) = uvm_cpu_chunk_first_in_region((va_block), (region), (nid), &(chunk_start)), \
575 (next_chunk_start) = (chunk_start) + (chunk ? uvm_cpu_chunk_num_pages(chunk) : 0); \
576 (chunk) != NULL; \
577 (chunk) = uvm_cpu_chunk_next_in_region((va_block), (region), (nid), (next_chunk_start), &(chunk_start)), \
578 (next_chunk_start) = (chunk_start) + ((chunk) ? uvm_cpu_chunk_num_pages((chunk)) : 0))
579
580 #define for_each_cpu_chunk_in_block(chunk, chunk_start, va_block, nid) \
581 for_each_cpu_chunk_in_block_region((chunk), \
582 (chunk_start), \
583 (va_block), \
584 (nid), \
585 uvm_va_block_region_from_block((va_block)))
586
587 #define for_each_cpu_chunk_in_block_safe(chunk, chunk_start, next_chunk_start, va_block, nid) \
588 for_each_cpu_chunk_in_block_region_safe((chunk), \
589 (chunk_start), \
590 (next_chunk_start), \
591 (va_block), \
592 (nid), \
593 uvm_va_block_region_from_block((va_block)))
594
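// Recompute the block-wide CPU resident mask as the union of the per-NUMA-node
// resident masks.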
595 static void block_update_cpu_resident_mask(uvm_va_block_t *va_block)
596 {
597 int nid;
598
599 uvm_page_mask_zero(&va_block->cpu.resident);
600 for_each_possible_uvm_node(nid) {
601 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(va_block, nid);
602 uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, &node_state->resident);
603 }
604 }
605
606 void uvm_va_block_cpu_set_resident_page(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
607 {
608 uvm_va_block_cpu_node_state_t *node_state;
609
610 node_state = block_node_state_get(va_block, nid);
611 UVM_ASSERT(node_state);
612 UVM_ASSERT(uvm_page_mask_test(&node_state->allocated, page_index));
613 uvm_page_mask_set(&node_state->resident, page_index);
614 uvm_page_mask_set(&va_block->cpu.resident, page_index);
615 uvm_processor_mask_set(&va_block->resident, UVM_ID_CPU);
616 }
617
618 // Set all CPU pages in the mask as resident on NUMA node nid.
619 // nid cannot be NUMA_NO_NODE.
620 static void uvm_va_block_cpu_set_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
621 {
622 uvm_va_block_cpu_node_state_t *node_state;
623
624 node_state = block_node_state_get(va_block, nid);
625 UVM_ASSERT(node_state);
626 UVM_ASSERT(uvm_page_mask_subset(mask, &node_state->allocated));
627 uvm_page_mask_or(&node_state->resident, &node_state->resident, mask);
628 uvm_page_mask_or(&va_block->cpu.resident, &va_block->cpu.resident, mask);
629 }
630
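// Mark the pages in page_mask as resident on the CPU, distributing them across
// the NUMA nodes recorded in the context's cpu_pages_used tracking masks.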
631 static void uvm_va_block_cpu_set_resident_all_chunks(uvm_va_block_t *va_block,
632 uvm_va_block_context_t *va_block_context,
633 const uvm_page_mask_t *page_mask)
634 {
635 uvm_make_resident_page_tracking_t *tracking = &va_block_context->make_resident.cpu_pages_used;
636 uvm_page_mask_t *node_pages_mask = &va_block_context->make_resident.node_pages_mask;
637 uvm_page_mask_t *page_mask_copy = &va_block_context->scratch_page_mask;
638 int nid;
639
640 if (uvm_page_mask_empty(page_mask))
641 return;
642
643 uvm_page_mask_copy(page_mask_copy, page_mask);
644 for_each_node_mask(nid, tracking->nodes) {
645 uvm_page_mask_t *node_mask = block_tracking_node_mask_get(va_block_context, nid);
646
647 if (uvm_page_mask_and(node_pages_mask, page_mask_copy, node_mask)) {
648 uvm_va_block_cpu_set_resident_mask(va_block, nid, node_pages_mask);
649 uvm_page_mask_andnot(page_mask_copy, page_mask_copy, node_pages_mask);
650 }
651 }
652
653 UVM_ASSERT(uvm_page_mask_empty(page_mask_copy));
654 }
655
656 // Clear residency for all CPU pages in the mask.
657 // nid cannot be NUMA_NO_NODE.
658 static void uvm_va_block_cpu_clear_resident_mask(uvm_va_block_t *va_block, int nid, const uvm_page_mask_t *mask)
659 {
660 uvm_va_block_cpu_node_state_t *node_state;
661
662 node_state = block_node_state_get(va_block, nid);
663 UVM_ASSERT(node_state);
664 uvm_page_mask_andnot(&node_state->resident, &node_state->resident, mask);
665 block_update_cpu_resident_mask(va_block);
666 }
667
668 static void uvm_va_block_cpu_clear_resident_region(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
669 {
670 uvm_va_block_cpu_node_state_t *node_state;
671
672 node_state = block_node_state_get(va_block, nid);
673 UVM_ASSERT(node_state);
674 uvm_page_mask_region_clear(&node_state->resident, region);
675 block_update_cpu_resident_mask(va_block);
676 }
677
678 // Clear residency bits from any/all processors that might have had pages resident.
679 // Note that both the destination processor and any CPU NUMA nodes that pages are
680 // migrating to need to be skipped, as the block logic sets the new page residency
681 // before clearing the old ones (see uvm_va_block_make_resident_finish()).
682 static void uvm_va_block_cpu_clear_resident_all_chunks(uvm_va_block_t *va_block,
683 uvm_va_block_context_t *va_block_context,
684 uvm_page_mask_t *page_mask)
685 {
686 int nid;
687
688 if (UVM_ID_IS_CPU(va_block_context->make_resident.dest_id) &&
689 nodes_empty(va_block_context->make_resident.cpu_pages_used.nodes))
690 return;
691
692 for_each_possible_uvm_node(nid) {
693 // If the destination is the CPU and pages were allocated on this node
694 // for the migration, clear residency on the node only for pages that
695 // are in the page_mask but not in the node's allocated mask.
696 if (UVM_ID_IS_CPU(va_block_context->make_resident.dest_id) &&
697 node_isset(nid, va_block_context->make_resident.cpu_pages_used.nodes)) {
698 uvm_page_mask_t *node_pages_mask = &va_block_context->make_resident.node_pages_mask;
699 uvm_page_mask_t *node_alloc_mask = block_tracking_node_mask_get(va_block_context, nid);
700 uvm_page_mask_t *nid_resident = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
701 uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
702
703 uvm_page_mask_andnot(node_pages_mask, nid_resident, node_alloc_mask);
704 if (uvm_page_mask_and(node_pages_mask, migrated_pages, node_pages_mask))
705 uvm_va_block_cpu_clear_resident_mask(va_block, nid, node_pages_mask);
706 }
707 else {
708 uvm_va_block_cpu_clear_resident_mask(va_block, nid, page_mask);
709 }
710 }
711 }
712
713 bool uvm_va_block_cpu_is_page_resident_on(uvm_va_block_t *va_block, int nid, uvm_page_index_t page_index)
714 {
715 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
716
717 return uvm_page_mask_test(resident_mask, page_index);
718 }
719
720 bool uvm_va_block_cpu_is_region_resident_on(uvm_va_block_t *va_block, int nid, uvm_va_block_region_t region)
721 {
722 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, nid);
723
724 return uvm_page_mask_region_full(resident_mask, region);
725 }
726
727 // Return the preferred NUMA node ID for the block's policy.
728 // If the preferred node ID is NUMA_NO_NODE, the nearest NUMA node ID
729 // with memory is returned. In most cases, this should be the current
730 // NUMA node.
731 static int uvm_va_block_context_get_node(uvm_va_block_context_t *va_block_context)
732 {
733 if (va_block_context->make_resident.dest_nid != NUMA_NO_NODE)
734 return va_block_context->make_resident.dest_nid;
735
736 return numa_mem_id();
737 }
738
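// Find the first vma intersecting the block at or after start. Returns NULL if
// start is past the end of the block or no such vma exists; otherwise *region
// is set to the portion of the block covered by the vma.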
739 struct vm_area_struct *uvm_va_block_find_vma_region(uvm_va_block_t *va_block,
740 struct mm_struct *mm,
741 NvU64 start,
742 uvm_va_block_region_t *region)
743 {
744 struct vm_area_struct *vma;
745 NvU64 end;
746
747 if (start > va_block->end)
748 return NULL;
749
750 vma = find_vma_intersection(mm, start, va_block->end + 1);
751 if (!vma)
752 return NULL;
753
754 if (start < vma->vm_start)
755 start = vma->vm_start;
756
757 end = vma->vm_end - 1;
758 if (end > va_block->end)
759 end = va_block->end;
760
761 *region = uvm_va_block_region_from_start_end(va_block, start, end);
762
763 return vma;
764 }
765
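// Consistency check for the block's CPU chunk bookkeeping: chunks must not
// overlap, the per-node allocated/resident masks must match the stored chunks,
// and a page may be resident on at most one NUMA node. Always returns true so
// it can be used inside assertions; failures trigger UVM_ASSERT() internally.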
766 static bool block_check_cpu_chunks(uvm_va_block_t *block)
767 {
768 int nid;
769 uvm_page_mask_t *temp_resident_mask;
770
771 temp_resident_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS | __GFP_ZERO);
772
773 for_each_possible_uvm_node(nid) {
774 uvm_cpu_chunk_t *chunk;
775 uvm_page_index_t page_index;
776 uvm_va_block_region_t prev_region = {0};
777 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
778 size_t alloced_pages = 0;
779
780 for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
781 uvm_va_block_region_t chunk_region = uvm_cpu_chunk_block_region(block, chunk, page_index);
782 size_t num_chunk_pages = uvm_cpu_chunk_num_pages(chunk);
783 uvm_page_index_t chunk_page;
784
785 UVM_ASSERT(prev_region.outer <= chunk_region.first);
786 UVM_ASSERT(IS_ALIGNED(uvm_va_block_region_start(block, chunk_region), uvm_cpu_chunk_get_size(chunk)));
787 UVM_ASSERT(chunk_region.outer <= uvm_va_block_num_cpu_pages(block));
788
789 alloced_pages += uvm_cpu_chunk_num_pages(chunk);
790 UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, chunk_region));
791 prev_region = chunk_region;
792
793 for (chunk_page = page_index; chunk_page < page_index + num_chunk_pages; chunk_page++)
794 UVM_ASSERT(uvm_cpu_chunk_get_chunk_for_page(block, nid, chunk_page) == chunk);
795 }
796
797 UVM_ASSERT(alloced_pages == uvm_page_mask_weight(&node_state->allocated));
798 UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &node_state->allocated));
799 UVM_ASSERT(uvm_page_mask_subset(&node_state->resident, &block->cpu.resident));
800 if (temp_resident_mask && !uvm_page_mask_empty(&node_state->resident)) {
801 UVM_ASSERT(!uvm_page_mask_intersects(&node_state->resident, temp_resident_mask));
802 uvm_page_mask_or(temp_resident_mask, temp_resident_mask, &node_state->resident);
803 }
804 }
805
806 if (temp_resident_mask) {
807 UVM_ASSERT(uvm_page_mask_equal(temp_resident_mask, &block->cpu.resident));
808 kmem_cache_free(g_uvm_page_mask_cache, temp_resident_mask);
809 }
810
811 return true;
812 }
813
814 // Frees any left-over free chunks and unpins all the used chunks
815 void uvm_va_block_retry_deinit(uvm_va_block_retry_t *retry, uvm_va_block_t *va_block)
816 {
817 uvm_gpu_t *gpu;
818 uvm_gpu_chunk_t *gpu_chunk;
819 uvm_gpu_chunk_t *next_chunk;
820
821 if (!retry)
822 return;
823
824 uvm_tracker_deinit(&retry->tracker);
825
826 // Free any unused chunks
827 list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->free_chunks, list) {
828 list_del_init(&gpu_chunk->list);
829 gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
830 uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
831 }
832
833 // Unpin all the used chunks now that we are done
834 list_for_each_entry_safe(gpu_chunk, next_chunk, &retry->used_chunks, list) {
835 list_del_init(&gpu_chunk->list);
836 gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
837 // HMM should have already moved allocated blocks to the referenced
838 // state, so any chunks left over were not migrated and should be freed.
839 if (uvm_va_block_is_hmm(va_block))
840 uvm_pmm_gpu_free(&gpu->pmm, gpu_chunk, NULL);
841 else
842 uvm_pmm_gpu_unpin_allocated(&gpu->pmm, gpu_chunk, va_block);
843 }
844 }
845
846 static void block_retry_add_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
847 {
848 list_add_tail(&gpu_chunk->list, &retry->free_chunks);
849 }
850
851 static void block_retry_add_used_chunk(uvm_va_block_retry_t *retry, uvm_gpu_chunk_t *gpu_chunk)
852 {
853 list_add_tail(&gpu_chunk->list, &retry->used_chunks);
854 }
855
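// Remove and return a free chunk of the requested size belonging to the given
// GPU from the retry's free list, or return NULL if no such chunk is present.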
856 static uvm_gpu_chunk_t *block_retry_get_free_chunk(uvm_va_block_retry_t *retry, uvm_gpu_t *gpu, uvm_chunk_size_t size)
857 {
858 uvm_gpu_chunk_t *gpu_chunk;
859
860 list_for_each_entry(gpu_chunk, &retry->free_chunks, list) {
861 if (uvm_gpu_chunk_get_gpu(gpu_chunk) == gpu && uvm_gpu_chunk_get_size(gpu_chunk) == size) {
862 list_del_init(&gpu_chunk->list);
863 return gpu_chunk;
864 }
865 }
866
867 return NULL;
868 }
869
870 // Encapsulates a reference to a physical page belonging to a specific processor
871 // within a VA block.
872 typedef struct
873 {
874 // Processor the page is on
875 uvm_processor_id_t processor;
876
877 // The page index
878 uvm_page_index_t page_index;
879
880 // If processor is the CPU, the NUMA node of the page.
881 int nid;
882 } block_phys_page_t;
883
884 static block_phys_page_t block_phys_page(uvm_processor_id_t processor, int nid, uvm_page_index_t page_index)
885 {
886 if (UVM_ID_IS_CPU(processor))
887 UVM_ASSERT(nid != NUMA_NO_NODE);
888
889 return (block_phys_page_t){ processor, page_index, nid };
890 }
891
892 NV_STATUS uvm_va_block_init(void)
893 {
894 if (uvm_enable_builtin_tests)
895 g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_wrapper_t", uvm_va_block_wrapper_t);
896 else
897 g_uvm_va_block_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_t", uvm_va_block_t);
898
899 if (!g_uvm_va_block_cache)
900 return NV_ERR_NO_MEMORY;
901
902 g_uvm_va_block_gpu_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_gpu_state_t", uvm_va_block_gpu_state_t);
903 if (!g_uvm_va_block_gpu_state_cache)
904 return NV_ERR_NO_MEMORY;
905
906 g_uvm_page_mask_cache = NV_KMEM_CACHE_CREATE("uvm_page_mask_t", uvm_page_mask_t);
907 if (!g_uvm_page_mask_cache)
908 return NV_ERR_NO_MEMORY;
909
910 g_uvm_va_block_context_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_context_t", uvm_va_block_context_t);
911 if (!g_uvm_va_block_context_cache)
912 return NV_ERR_NO_MEMORY;
913
914 g_uvm_va_block_cpu_node_state_cache = NV_KMEM_CACHE_CREATE("uvm_va_block_cpu_node_state_t",
915 uvm_va_block_cpu_node_state_t);
916 if (!g_uvm_va_block_cpu_node_state_cache)
917 return NV_ERR_NO_MEMORY;
918
919 return NV_OK;
920 }
921
922 void uvm_va_block_exit(void)
923 {
924 kmem_cache_destroy_safe(&g_uvm_va_block_cpu_node_state_cache);
925 kmem_cache_destroy_safe(&g_uvm_va_block_context_cache);
926 kmem_cache_destroy_safe(&g_uvm_page_mask_cache);
927 kmem_cache_destroy_safe(&g_uvm_va_block_gpu_state_cache);
928 kmem_cache_destroy_safe(&g_uvm_va_block_cache);
929 }
930
931 static void block_context_free_tracking(uvm_make_resident_page_tracking_t *tracking)
932 {
933 size_t index;
934
935 for (index = 0; index < num_possible_nodes(); index++) {
936 if (tracking->node_masks[index])
937 kmem_cache_free(g_uvm_page_mask_cache, tracking->node_masks[index]);
938 }
939
940 uvm_kvfree(tracking->node_masks);
941 }
942
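// Allocate one page mask per possible NUMA node for the context's CPU page
// tracking. On failure, any masks allocated so far are freed.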
943 static NV_STATUS block_context_alloc_tracking(uvm_make_resident_page_tracking_t *tracking)
944 {
945 size_t index;
946
947 tracking->node_masks = uvm_kvmalloc_zero(num_possible_nodes() * sizeof(*tracking->node_masks));
948 if (!tracking->node_masks)
949 return NV_ERR_NO_MEMORY;
950
951 for (index = 0; index < num_possible_nodes(); index++) {
952 tracking->node_masks[index] = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
953 if (!tracking->node_masks[index])
954 goto error;
955 }
956
957 return NV_OK;
958
959 error:
960 block_context_free_tracking(tracking);
961 return NV_ERR_NO_MEMORY;
962 }
963
964 uvm_va_block_context_t *uvm_va_block_context_alloc(struct mm_struct *mm)
965 {
966 uvm_va_block_context_t *block_context = kmem_cache_alloc(g_uvm_va_block_context_cache, NV_UVM_GFP_FLAGS);
967 NV_STATUS status;
968
969 if (!block_context)
970 return NULL;
971
972 status = block_context_alloc_tracking(&block_context->make_resident.cpu_pages_used);
973 if (status != NV_OK) {
974 kmem_cache_free(g_uvm_va_block_context_cache, block_context);
975 return NULL;
976 }
977
978 uvm_va_block_context_init(block_context, mm);
979 return block_context;
980 }
981
982 void uvm_va_block_context_init(uvm_va_block_context_t *va_block_context, struct mm_struct *mm)
983 {
984 UVM_ASSERT(va_block_context);
985
986 // Write garbage into the VA Block context to ensure that the UVM code
987 // clears masks appropriately
988 if (UVM_IS_DEBUG()) {
989 uvm_page_mask_t **mask_array = va_block_context->make_resident.cpu_pages_used.node_masks;
990 int nid;
991
992 memset(va_block_context, 0xff, sizeof(*va_block_context));
993
994 for_each_possible_uvm_node(nid)
995 uvm_page_mask_fill(mask_array[node_to_index(nid)]);
996
997 va_block_context->make_resident.cpu_pages_used.node_masks = mask_array;
998 }
999
1000 va_block_context->mm = mm;
1001 va_block_context->make_resident.dest_nid = NUMA_NO_NODE;
1002 nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
1003 }
1004
1005 void uvm_va_block_context_free(uvm_va_block_context_t *va_block_context)
1006 {
1007 if (va_block_context) {
1008 block_context_free_tracking(&va_block_context->make_resident.cpu_pages_used);
1009 kmem_cache_free(g_uvm_va_block_context_cache, va_block_context);
1010 }
1011 }
1012
1013 // Convert from page_index to chunk_index. The goal is for each system page in
1014 // the region [start, start + size) to be covered by the largest naturally-
1015 // aligned user chunk size.
1016 size_t uvm_va_block_gpu_chunk_index_range(NvU64 start,
1017 NvU64 size,
1018 uvm_gpu_t *gpu,
1019 uvm_page_index_t page_index,
1020 uvm_chunk_size_t *out_chunk_size)
1021 {
1022 uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
1023 uvm_chunk_size_t chunk_size, final_chunk_size;
1024 size_t num_chunks, num_chunks_total;
1025 NvU64 addr, end, aligned_start, aligned_addr, aligned_end, temp_size;
1026
1027 UVM_ASSERT(PAGE_ALIGNED(start));
1028 UVM_ASSERT(PAGE_ALIGNED(size));
1029 UVM_ASSERT(size > 0);
1030 UVM_ASSERT(size <= UVM_CHUNK_SIZE_2M);
1031 UVM_ASSERT(UVM_ALIGN_DOWN(start, UVM_CHUNK_SIZE_2M) == UVM_ALIGN_DOWN(start + size - 1, UVM_CHUNK_SIZE_2M));
1032 BUILD_BUG_ON(UVM_VA_BLOCK_SIZE != UVM_CHUNK_SIZE_2M);
1033
1034 // PAGE_SIZE needs to be the lowest natively-supported chunk size in the
1035 // mask, since we never deal with chunk sizes smaller than that (although we
1036 // may have PTEs mapping pages smaller than that).
1037 UVM_ASSERT(uvm_chunk_find_first_size(chunk_sizes) == PAGE_SIZE);
1038
1039 // Optimize the ideal Pascal+ case: the whole block is covered by a single
1040 // 2M page.
1041 if ((chunk_sizes & UVM_CHUNK_SIZE_2M) && size == UVM_CHUNK_SIZE_2M) {
1042 UVM_ASSERT(IS_ALIGNED(start, UVM_CHUNK_SIZE_2M));
1043 final_chunk_size = UVM_CHUNK_SIZE_2M;
1044 num_chunks_total = 0;
1045 goto out;
1046 }
1047
1048 // Only one 2M chunk can fit within a VA block on any GPU architecture, so
1049 // remove that size from consideration.
1050 chunk_sizes &= ~UVM_CHUNK_SIZE_2M;
1051
1052 // Next common case: the whole block is aligned and sized to perfectly fit
1053 // the largest page size.
1054 final_chunk_size = uvm_chunk_find_last_size(chunk_sizes);
1055 if (IS_ALIGNED(start, final_chunk_size) && IS_ALIGNED(size, final_chunk_size)) {
1056 num_chunks_total = (size_t)uvm_div_pow2_64(page_index * PAGE_SIZE, final_chunk_size);
1057 goto out;
1058 }
1059
1060 // We didn't hit our special paths. Do it the hard way.
1061
1062 num_chunks_total = 0;
1063 addr = start + page_index * PAGE_SIZE;
1064 end = start + size;
1065 final_chunk_size = 0;
1066 UVM_ASSERT(addr < end);
1067
1068 // The below loop collapses almost completely when chunk_size == PAGE_SIZE
1069 // since in that lowest-common-denominator case everything is already
1070 // aligned. Skip it and handle that specially after the loop.
1071 //
1072 // Note that since we removed 2M already above, this loop will only iterate
1073 // once on x86 Pascal+ since only 64K is left.
1074 chunk_sizes &= ~PAGE_SIZE;
1075
1076 // This loop calculates the number of chunks between start and addr by
1077 // calculating the number of whole chunks of each size between them,
1078 // starting with the largest allowed chunk size. This requires fewer
1079 // iterations than if we began from start and kept calculating the next
1080 // larger chunk size boundary.
1081 for_each_chunk_size_rev(chunk_size, chunk_sizes) {
1082 aligned_start = UVM_ALIGN_UP(start, chunk_size);
1083 aligned_addr = UVM_ALIGN_DOWN(addr, chunk_size);
1084 aligned_end = UVM_ALIGN_DOWN(end, chunk_size);
1085
1086 // If addr and start are within the same chunk, try a smaller chunk size
1087 if (aligned_start > aligned_addr)
1088 continue;
1089
1090 // If addr and end are not in the same chunk, then addr is covered by a
1091 // single chunk of the current size. Ignore smaller boundaries between
1092 // addr and aligned_addr.
1093 if (aligned_addr < aligned_end && final_chunk_size == 0) {
1094 addr = aligned_addr;
1095 final_chunk_size = chunk_size;
1096 }
1097
1098 // How many chunks of this size are between start and addr? Note that
1099 // this might be 0 since aligned_addr and aligned_start could be in the
1100 // same chunk.
1101 num_chunks = uvm_div_pow2_32(((NvU32)aligned_addr - aligned_start), chunk_size);
1102 num_chunks_total += num_chunks;
1103
1104 // We've already accounted for these chunks, so "remove" them by
1105 // bringing start, addr, and end closer together to calculate the
1106 // remaining chunk sizes.
1107 temp_size = num_chunks * chunk_size;
1108 addr -= temp_size;
1109 end -= temp_size;
1110
1111 // Once there's no separation between addr and start, and we've
1112 // successfully found the right chunk size when taking end into account,
1113 // we're done.
1114 if (addr == start && final_chunk_size)
1115 break;
1116 }
1117
1118 // Handle PAGE_SIZE cleanup since we skipped it in the loop
1119 num_chunks_total += (addr - start) / PAGE_SIZE;
1120 if (final_chunk_size == 0)
1121 final_chunk_size = PAGE_SIZE;
1122
1123 out:
1124 if (out_chunk_size)
1125 *out_chunk_size = final_chunk_size;
1126
1127 return num_chunks_total;
1128 }
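
// Illustrative examples (assuming PAGE_SIZE == 4K and a GPU supporting 4K, 64K
// and 2M user chunk sizes): for a 2MB-aligned, 2MB-sized range the whole block
// is covered by a single 2M chunk, so the returned index is always 0. For a
// 64K-aligned range of 192K, the range is covered by three 64K chunks, so
// page_index 20 (80K into the range) falls in chunk index 1.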
1129
1130 static size_t block_gpu_chunk_index_range(uvm_va_block_t *va_block,
1131 NvU64 start,
1132 NvU64 size,
1133 uvm_gpu_t *gpu,
1134 uvm_page_index_t page_index,
1135 uvm_chunk_size_t *out_chunk_size)
1136 {
1137 if (uvm_va_block_is_hmm(va_block)) {
1138 if (out_chunk_size)
1139 *out_chunk_size = PAGE_SIZE;
1140 return page_index;
1141 }
1142
1143 return uvm_va_block_gpu_chunk_index_range(start, size, gpu, page_index, out_chunk_size);
1144 }
1145
1146 static size_t block_gpu_chunk_index(uvm_va_block_t *block,
1147 uvm_gpu_t *gpu,
1148 uvm_page_index_t page_index,
1149 uvm_chunk_size_t *out_chunk_size)
1150 {
1151 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1152 uvm_chunk_size_t size;
1153 uvm_gpu_chunk_t *chunk;
1154 size_t index;
1155
1156 index = block_gpu_chunk_index_range(block, block->start, uvm_va_block_size(block), gpu, page_index, &size);
1157
1158 UVM_ASSERT(size >= PAGE_SIZE);
1159
1160 if (gpu_state) {
1161 UVM_ASSERT(gpu_state->chunks);
1162 chunk = gpu_state->chunks[index];
1163 if (chunk) {
1164 UVM_ASSERT(uvm_gpu_chunk_get_size(chunk) == size);
1165 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_PMA_OWNED);
1166 UVM_ASSERT(chunk->state != UVM_PMM_GPU_CHUNK_STATE_FREE);
1167 }
1168 }
1169
1170 if (out_chunk_size)
1171 *out_chunk_size = size;
1172
1173 return index;
1174 }
1175
1176 // Compute the size of the chunk known to start at start_page_index
1177 static uvm_chunk_size_t block_gpu_chunk_size(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t start_page_index)
1178 {
1179 uvm_chunk_sizes_mask_t chunk_sizes = gpu->parent->mmu_user_chunk_sizes;
1180 uvm_chunk_sizes_mask_t start_alignments, pow2_leq_size, allowed_sizes;
1181 NvU64 start = uvm_va_block_cpu_page_address(block, start_page_index);
1182 NvU64 size = block->end - start + 1;
1183
1184 if (uvm_va_block_is_hmm(block))
1185 return PAGE_SIZE;
1186
1187 // Create a mask of all sizes for which start is aligned. x ^ (x-1) yields a
1188 // mask of the rightmost 1 bit in x, as well as all trailing 0 bits in x.
1189 // Example: 1011000 -> 0001111
1190 start_alignments = (uvm_chunk_sizes_mask_t)(start ^ (start - 1));
1191
1192 // Next, compute all sizes (powers of two) which are <= size.
1193 pow2_leq_size = (uvm_chunk_sizes_mask_t)rounddown_pow_of_two(size);
1194 pow2_leq_size |= pow2_leq_size - 1;
1195
1196 // Now AND them all together to get our list of GPU-supported chunk sizes
1197 // which are aligned to start and will fit within size.
1198 allowed_sizes = chunk_sizes & start_alignments & pow2_leq_size;
1199
1200 // start and size must always be aligned to at least the smallest supported
1201 // chunk size (PAGE_SIZE).
1202 UVM_ASSERT(allowed_sizes >= PAGE_SIZE);
1203
1204 // Take the largest allowed size
1205 return uvm_chunk_find_last_size(allowed_sizes);
1206 }
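
// Illustrative example (assuming 4K, 64K and 2M supported chunk sizes): for a
// start address aligned to 64K but not to 128K, start_alignments permits sizes
// up to 64K; if 96K remains until the end of the block, pow2_leq_size also
// permits sizes up to 64K, so the function returns 64K.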
1207
1208 static size_t block_num_gpu_chunks(uvm_va_block_t *block, uvm_gpu_t *gpu)
1209 {
1210 return block_gpu_chunk_index(block, gpu, uvm_va_block_cpu_page_index(block, block->end), NULL) + 1;
1211 }
1212
1213 static size_t block_num_gpu_chunks_range(uvm_va_block_t *block, NvU64 start, NvU64 size, uvm_gpu_t *gpu)
1214 {
1215 uvm_page_index_t last_page_index = (size_t)((size / PAGE_SIZE) - 1);
1216 return block_gpu_chunk_index_range(block, start, size, gpu, last_page_index, NULL) + 1;
1217 }
1218
1219 uvm_gpu_chunk_t *uvm_va_block_lookup_gpu_chunk(uvm_va_block_t *va_block, uvm_gpu_t *gpu, NvU64 address)
1220 {
1221 size_t chunk_index;
1222 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
1223 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, address);
1224
1225 uvm_assert_mutex_locked(&va_block->lock);
1226
1227 if (!gpu_state)
1228 return NULL;
1229
1230 chunk_index = block_gpu_chunk_index(va_block, gpu, page_index, NULL);
1231
1232 return gpu_state->chunks[chunk_index];
1233 }
1234
1235 static void uvm_va_block_free(uvm_va_block_t *block)
1236 {
1237 if (uvm_enable_builtin_tests) {
1238 uvm_va_block_wrapper_t *block_wrapper = container_of(block, uvm_va_block_wrapper_t, block);
1239
1240 kmem_cache_free(g_uvm_va_block_cache, block_wrapper);
1241 }
1242 else {
1243 kmem_cache_free(g_uvm_va_block_cache, block);
1244 }
1245 }
1246
1247 NV_STATUS uvm_va_block_create(uvm_va_range_t *va_range,
1248 NvU64 start,
1249 NvU64 end,
1250 uvm_va_block_t **out_block)
1251 {
1252 uvm_va_block_t *block = NULL;
1253 NvU64 size = end - start + 1;
1254 int nid;
1255
1256 UVM_ASSERT(PAGE_ALIGNED(start));
1257 UVM_ASSERT(PAGE_ALIGNED(end + 1));
1258 UVM_ASSERT(PAGE_ALIGNED(size));
1259 UVM_ASSERT(size > 0);
1260 UVM_ASSERT(size <= UVM_VA_BLOCK_SIZE);
1261
1262 if (va_range) {
1263 // Create a managed va_block.
1264 UVM_ASSERT(start >= va_range->node.start);
1265 UVM_ASSERT(end <= va_range->node.end);
1266 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
1267 }
1268
1269 // Blocks can't span a block alignment boundary
1270 UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
1271
1272 if (uvm_enable_builtin_tests) {
1273 uvm_va_block_wrapper_t *block_wrapper = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
1274
1275 if (block_wrapper) {
1276 block = &block_wrapper->block;
1277 block_wrapper->test.cpu_chunk_allocation_target_id = NUMA_NO_NODE;
1278 block_wrapper->test.cpu_chunk_allocation_actual_id = NUMA_NO_NODE;
1279 }
1280 }
1281 else {
1282 block = nv_kmem_cache_zalloc(g_uvm_va_block_cache, NV_UVM_GFP_FLAGS);
1283 }
1284
1285 if (!block)
1286 return NV_ERR_NO_MEMORY;
1287
1288 block->cpu.node_state = uvm_kvmalloc_zero(sizeof(*block->cpu.node_state) * num_possible_nodes());
1289 if (!block->cpu.node_state)
1290 goto error_block_free;
1291
1292 for_each_possible_uvm_node(nid) {
1293 size_t index = node_to_index(nid);
1294
1295 block->cpu.node_state[index] = nv_kmem_cache_zalloc(g_uvm_va_block_cpu_node_state_cache, NV_UVM_GFP_FLAGS);
1296 if (!block->cpu.node_state[index])
1297 goto error;
1298 }
1299
1300 nv_kref_init(&block->kref);
1301 uvm_mutex_init(&block->lock, UVM_LOCK_ORDER_VA_BLOCK);
1302 block->start = start;
1303 block->end = end;
1304 block->va_range = va_range;
1305 uvm_tracker_init(&block->tracker);
1306 block->prefetch_info.last_migration_proc_id = UVM_ID_INVALID;
1307
1308 nv_kthread_q_item_init(&block->eviction_mappings_q_item, block_add_eviction_mappings_entry, block);
1309
1310 *out_block = block;
1311 return NV_OK;
1312
1313 error:
1314 if (block->cpu.node_state) {
1315 for_each_possible_uvm_node(nid) {
1316 size_t index = node_to_index(nid);
1317
1318 if (block->cpu.node_state[index])
1319 kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
1320 }
1321 }
1322
1323 uvm_kvfree(block->cpu.node_state);
1324
1325 error_block_free:
1326 uvm_va_block_free(block);
1327 return NV_ERR_NO_MEMORY;
1328 }
1329
1330 static void cpu_chunk_remove_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk, uvm_gpu_t *gpu)
1331 {
1332 NvU64 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
1333 if (gpu_mapping_addr == 0)
1334 return;
1335
1336 uvm_pmm_sysmem_mappings_remove_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings, gpu_mapping_addr);
1337 uvm_cpu_chunk_unmap_parent_gpu_phys(chunk, gpu->parent);
1338 }
1339
1340 static NV_STATUS cpu_chunk_add_sysmem_gpu_mapping(uvm_cpu_chunk_t *chunk,
1341 uvm_va_block_t *block,
1342 uvm_page_index_t page_index,
1343 uvm_gpu_t *gpu)
1344 {
1345 NV_STATUS status;
1346 uvm_chunk_size_t chunk_size;
1347
1348 // When the Confidential Computing feature is enabled, transfers don't use
1349 // the DMA mapping of CPU chunks (since it's protected memory), but rather
1350 // the DMA address of the unprotected DMA buffer.
1351 if (g_uvm_global.conf_computing_enabled)
1352 return NV_OK;
1353
1354 status = uvm_cpu_chunk_map_gpu(chunk, gpu);
1355 if (status != NV_OK)
1356 return status;
1357
1358 chunk_size = uvm_cpu_chunk_get_size(chunk);
1359
1360 // TODO: Bug 3744779: Handle benign assertion in
1361 // pmm_sysmem_mappings_remove_gpu_mapping() in case of a
1362 // failure.
1363 status = uvm_pmm_sysmem_mappings_add_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
1364 uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent),
1365 uvm_va_block_cpu_page_address(block, page_index),
1366 chunk_size,
1367 block,
1368 UVM_ID_CPU);
1369 if (status != NV_OK)
1370 cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1371
1372 return status;
1373 }
1374
1375 static void block_gpu_unmap_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1376 {
1377 uvm_cpu_chunk_t *chunk;
1378 uvm_page_index_t page_index;
1379 int nid;
1380
1381 for_each_possible_uvm_node(nid) {
1382 for_each_cpu_chunk_in_block(chunk, page_index, block, nid)
1383 cpu_chunk_remove_sysmem_gpu_mapping(chunk, gpu);
1384 }
1385 }
1386
1387 static NV_STATUS block_gpu_map_phys_all_cpu_pages(uvm_va_block_t *block, uvm_gpu_t *gpu)
1388 {
1389 NV_STATUS status;
1390 uvm_cpu_chunk_t *chunk;
1391 NvU64 block_mapping_size = uvm_va_block_size(block);
1392 uvm_page_index_t page_index;
1393 int nid;
1394
1395 UVM_ASSERT(IS_ALIGNED(block_mapping_size, UVM_PAGE_SIZE_4K));
1396
1397 for_each_possible_uvm_node(nid) {
1398 for_each_cpu_chunk_in_block(chunk, page_index, block, nid) {
1399 UVM_ASSERT_MSG(uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent) == 0,
1400 "GPU%u DMA address 0x%llx\n",
1401 uvm_id_value(gpu->id),
1402 uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent));
1403
1404 status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, page_index, gpu);
1405 if (status != NV_OK)
1406 goto error;
1407 }
1408 }
1409
1410 return NV_OK;
1411
1412 error:
1413 block_gpu_unmap_phys_all_cpu_pages(block, gpu);
1414 return status;
1415 }
1416
1417 static NV_STATUS block_sysmem_mappings_add_gpu_chunk(uvm_va_block_t *block,
1418 uvm_gpu_t *local_gpu,
1419 uvm_gpu_chunk_t *chunk,
1420 uvm_gpu_t *accessing_gpu)
1421 {
1422 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1423 return uvm_pmm_sysmem_mappings_add_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
1424 peer_addr,
1425 block->start + chunk->va_block_page_index * PAGE_SIZE,
1426 uvm_gpu_chunk_get_size(chunk),
1427 block,
1428 local_gpu->id);
1429 }
1430
1431 static void block_sysmem_mappings_remove_gpu_chunk(uvm_gpu_t *local_gpu,
1432 uvm_gpu_chunk_t *chunk,
1433 uvm_gpu_t *accessing_gpu)
1434 {
1435 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&local_gpu->pmm, chunk, accessing_gpu);
1436 uvm_pmm_sysmem_mappings_remove_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings, peer_addr);
1437 }
1438
1439 static NV_STATUS block_gpu_map_all_chunks_indirect_peer(uvm_va_block_t *block,
1440 uvm_gpu_t *local_gpu,
1441 uvm_gpu_t *accessing_gpu)
1442 {
1443 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1444 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1445 size_t num_chunks, i;
1446 NV_STATUS status;
1447
1448 UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1449 accessing_gpu->id));
1450
1451 // If no chunks are allocated currently, the mappings will be created later
1452 // at chunk allocation.
1453 if (!gpu_state || !gpu_state->chunks)
1454 return NV_OK;
1455
1456 num_chunks = block_num_gpu_chunks(block, local_gpu);
1457 for (i = 0; i < num_chunks; i++) {
1458 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1459 if (!chunk)
1460 continue;
1461
1462 status = uvm_pmm_gpu_indirect_peer_map(&local_gpu->pmm, chunk, accessing_gpu);
1463 if (status != NV_OK)
1464 goto error;
1465
1466 status = block_sysmem_mappings_add_gpu_chunk(block, local_gpu, chunk, accessing_gpu);
1467 if (status != NV_OK)
1468 goto error;
1469 }
1470
1471 return NV_OK;
1472
1473 error:
1474 while (i-- > 0) {
1475 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1476 if (chunk) {
1477 // Indirect peer mappings are removed lazily by PMM, so if an error
1478 // occurs the mappings established above will be removed when the
1479 // chunk is freed later on. We only need to remove the sysmem
1480 // reverse mappings.
1481 block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1482 }
1483 }
1484
1485 return status;
1486 }
1487
1488 // Mappings for indirect peers are removed lazily by PMM, but we need to remove
1489 // the entries from the reverse map.
1490 static void block_gpu_unmap_all_chunks_indirect_peer(uvm_va_block_t *block,
1491 uvm_gpu_t *local_gpu,
1492 uvm_gpu_t *accessing_gpu)
1493 {
1494 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, local_gpu->id);
1495 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1496 size_t num_chunks, i;
1497
1498 UVM_ASSERT(uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(local_gpu->id)],
1499 accessing_gpu->id));
1500
1501 // Exit if no chunks are allocated currently.
1502 if (!gpu_state || !gpu_state->chunks)
1503 return;
1504
1505 num_chunks = block_num_gpu_chunks(block, local_gpu);
1506 for (i = 0; i < num_chunks; i++) {
1507 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
1508 if (chunk)
1509 block_sysmem_mappings_remove_gpu_chunk(local_gpu, chunk, accessing_gpu);
1510 }
1511 }
1512
1513 // Retrieves the gpu_state for the given GPU. The returned pointer is
1514 // internally managed and will be allocated (and freed) automatically,
1515 // rather than by the caller.
1516 static uvm_va_block_gpu_state_t *block_gpu_state_get_alloc(uvm_va_block_t *block, uvm_gpu_t *gpu)
1517 {
1518 NV_STATUS status;
1519 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
1520
1521 if (gpu_state)
1522 return gpu_state;
1523
1524 gpu_state = nv_kmem_cache_zalloc(g_uvm_va_block_gpu_state_cache, NV_UVM_GFP_FLAGS);
1525 if (!gpu_state)
1526 return NULL;
1527
1528 gpu_state->chunks = uvm_kvmalloc_zero(block_num_gpu_chunks(block, gpu) * sizeof(gpu_state->chunks[0]));
1529 if (!gpu_state->chunks)
1530 goto error;
1531
1532 block->gpus[uvm_id_gpu_index(gpu->id)] = gpu_state;
1533
1534 status = block_gpu_map_phys_all_cpu_pages(block, gpu);
1535 if (status != NV_OK)
1536 goto error;
1537
1538 return gpu_state;
1539
1540 error:
1541 uvm_kvfree(gpu_state->chunks);
1542 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
1543 block->gpus[uvm_id_gpu_index(gpu->id)] = NULL;
1544
1545 return NULL;
1546 }
1547
1548 NV_STATUS uvm_va_block_gpu_state_alloc(uvm_va_block_t *va_block)
1549 {
1550 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
1551 uvm_gpu_id_t gpu_id;
1552
1553 UVM_ASSERT(uvm_va_block_is_hmm(va_block));
1554 uvm_assert_mutex_locked(&va_block->lock);
1555
1556 for_each_gpu_id_in_mask(gpu_id, &va_space->registered_gpus) {
1557 if (!block_gpu_state_get_alloc(va_block, uvm_va_space_get_gpu(va_space, gpu_id)))
1558 return NV_ERR_NO_MEMORY;
1559 }
1560
1561 return NV_OK;
1562 }
1563
1564 void uvm_va_block_unmap_cpu_chunk_on_gpus(uvm_va_block_t *block,
1565 uvm_cpu_chunk_t *chunk,
1566 uvm_page_index_t page_index)
1567 {
1568 uvm_gpu_id_t id;
1569
1570 for_each_gpu_id(id) {
1571 if (uvm_va_block_gpu_state_get(block, id))
1572 cpu_chunk_remove_sysmem_gpu_mapping(chunk, block_get_gpu(block, id));
1573 }
1574 }
1575
1576 NV_STATUS uvm_va_block_map_cpu_chunk_on_gpus(uvm_va_block_t *block,
1577 uvm_cpu_chunk_t *chunk,
1578 uvm_page_index_t page_index)
1579 {
1580 NV_STATUS status;
1581 uvm_gpu_id_t id;
1582 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1583 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1584
1585 // We can't iterate over va_space->registered_gpus because we might be
1586 // on the eviction path, which does not have the VA space lock held. We have
1587 // the VA block lock held, however, so the gpu_states can't change.
1588 uvm_assert_mutex_locked(&block->lock);
1589
1590 for_each_gpu_id(id) {
1591 uvm_gpu_t *gpu;
1592
1593 if (!uvm_va_block_gpu_state_get(block, id))
1594 continue;
1595
1596 gpu = block_get_gpu(block, id);
1597 status = cpu_chunk_add_sysmem_gpu_mapping(chunk, block, chunk_region.first, gpu);
1598 if (status != NV_OK)
1599 goto error;
1600 }
1601
1602 return NV_OK;
1603
1604 error:
1605 uvm_va_block_unmap_cpu_chunk_on_gpus(block, chunk, page_index);
1606 return status;
1607 }
1608
1609 void uvm_va_block_remove_cpu_chunks(uvm_va_block_t *va_block, uvm_va_block_region_t region)
1610 {
1611 uvm_cpu_chunk_t *chunk;
1612 uvm_page_index_t page_index, next_page_index;
1613 uvm_va_block_region_t chunk_region;
1614 int nid;
1615
1616 for_each_possible_uvm_node(nid) {
1617 for_each_cpu_chunk_in_block_region_safe(chunk, page_index, next_page_index, va_block, nid, region) {
1618 chunk_region = uvm_va_block_region(page_index, page_index + uvm_cpu_chunk_num_pages(chunk));
1619
1620 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], chunk_region);
1621 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], chunk_region);
1622 uvm_va_block_cpu_clear_resident_region(va_block, nid, chunk_region);
1623 uvm_cpu_chunk_remove_from_block(va_block, nid, page_index);
1624 uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
1625 uvm_cpu_chunk_free(chunk);
1626 }
1627 }
1628
1629 if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
1630 uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
1631
1632 if (uvm_page_mask_empty(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE)))
1633 uvm_processor_mask_clear(&va_block->resident, UVM_ID_CPU);
1634 }
1635
1636 // Create physical mappings to allow other GPUs to access this chunk.
1637 static NV_STATUS block_map_indirect_peers_to_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1638 {
1639 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1640 uvm_gpu_t *accessing_gpu, *remove_gpu;
1641 NV_STATUS status;
1642
1643 // Unlike uvm_va_block_map_cpu_chunk_on_gpus, this function isn't called on
1644 // the eviction path, so we can assume that the VA space is locked.
1645 //
1646 // TODO: Bug 2007346: In the future we may want to enable eviction to peers,
1647 // meaning we may need to allocate peer memory and map it on the
1648 // eviction path. That will require making sure that peers can't be
1649 // enabled or disabled either in the VA space or globally within this
1650 // function.
1651 uvm_assert_rwsem_locked(&va_space->lock);
1652 uvm_assert_mutex_locked(&block->lock);
1653
1654 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1655 status = uvm_pmm_gpu_indirect_peer_map(&gpu->pmm, chunk, accessing_gpu);
1656 if (status != NV_OK)
1657 goto error;
1658
1659 status = block_sysmem_mappings_add_gpu_chunk(block, gpu, chunk, accessing_gpu);
1660 if (status != NV_OK)
1661 goto error;
1662 }
1663
1664 return NV_OK;
1665
1666 error:
1667 for_each_va_space_gpu_in_mask(remove_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
1668 if (remove_gpu == accessing_gpu)
1669 break;
1670
1671 // Indirect peer mappings are removed lazily by PMM, so if an error
1672 // occurs the mappings established above will be removed when the
1673 // chunk is freed later on. We only need to remove the sysmem
1674 // reverse mappings.
1675 block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, remove_gpu);
1676 }
1677
1678 return status;
1679 }
1680
1681 static void block_unmap_indirect_peers_from_gpu_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
1682 {
1683 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
1684 uvm_gpu_t *peer_gpu;
1685
1686 uvm_assert_rwsem_locked(&va_space->lock);
1687 uvm_assert_mutex_locked(&block->lock);
1688
1689 // Indirect peer mappings are removed lazily by PMM, so we only need to
1690 // remove the sysmem reverse mappings.
1691 for_each_va_space_gpu_in_mask(peer_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
1692 block_sysmem_mappings_remove_gpu_chunk(gpu, chunk, peer_gpu);
1693 }
1694
1695 // Mark a CPU page as dirty.
1696 static void block_mark_cpu_page_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1697 {
1698 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1699 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1700 uvm_cpu_chunk_mark_dirty(chunk, page_index - chunk_region.first);
1701 }
1702
1703 // Mark a CPU page as clean.
1704 static void block_mark_cpu_page_clean(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1705 {
1706 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1707 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1708 uvm_cpu_chunk_mark_clean(chunk, page_index - chunk_region.first);
1709 }
1710
1711 // Check if a CPU page is dirty.
1712 static bool block_cpu_page_is_dirty(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
1713 {
1714 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
1715 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
1716 return uvm_cpu_chunk_is_dirty(chunk, page_index - chunk_region.first);
1717 }
1718
1719 static NV_STATUS block_alloc_cpu_chunk_inject_error(uvm_va_block_t *block,
1720 uvm_chunk_size_t alloc_size,
1721 uvm_cpu_chunk_alloc_flags_t flags,
1722 int nid,
1723 uvm_cpu_chunk_t **chunk)
1724 {
1725 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
1726
1727 if (block_test) {
1728         // Return an out-of-memory error if the tests have requested it. As opposed
1729 // to other error injection settings, this one fails N times and then
1730 // succeeds.
1731 // TODO: Bug 3701182: This will print a warning in Linux kernels newer
1732 // than 5.16.0-rc1+.
1733 if (block_test->inject_cpu_pages_allocation_error_count) {
1734 if (block_test->inject_cpu_pages_allocation_error_count != ~(NvU32)0)
1735 block_test->inject_cpu_pages_allocation_error_count--;
1736 return NV_ERR_NO_MEMORY;
1737 }
1738
1739 if (block_test->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
1740 nid = block_test->cpu_chunk_allocation_actual_id;
1741 }
1742
1743 return uvm_cpu_chunk_alloc(alloc_size, flags, nid, chunk);
1744 }
1745
1746 // Allocate a CPU chunk with the given properties. This may involve retrying if
1747 // allocations fail. Allocating larger chunk sizes takes priority over
1748 // allocating on the specified node in the following manner:
1749 //
1750 // 1. Attempt to allocate the largest chunk on nid.
1751 // 2. If that fails attempt allocation of the largest chunk on any nid.
1752 // 3. If that fails attempt progressively smaller allocations on any nid.
1753 //
1754 // Returns NV_OK on success. Returns NV_WARN_MORE_PROCESSING_REQUIRED if
1755 // UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT was ignored to successfully allocate.
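// For illustration, assuming the usual 4K/64K/2M CPU chunk sizes are all
// enabled and UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT is set, the attempt order is:
//   1) 2M on nid (strict), 2) 2M on any node, 3) 64K, 4) 4K.
// Once the strict attempt on nid has failed, any later success is reported as
// NV_WARN_MORE_PROCESSING_REQUIRED so callers can tell the fallback happened.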
1756 static NV_STATUS block_alloc_cpu_chunk(uvm_va_block_t *block,
1757 uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1758 uvm_cpu_chunk_alloc_flags_t flags,
1759 int nid,
1760 uvm_cpu_chunk_t **chunk)
1761 {
1762 NV_STATUS status = NV_ERR_NO_MEMORY;
1763 uvm_chunk_size_t alloc_size;
1764 bool numa_fallback = false;
1765
1766 for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1767 status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, nid, chunk);
1768 if (status == NV_OK)
1769 break;
1770
1771 if (flags & UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT) {
1772 flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
1773 numa_fallback = true;
1774 status = block_alloc_cpu_chunk_inject_error(block, alloc_size, flags, NUMA_NO_NODE, chunk);
1775 if (status == NV_OK)
1776 break;
1777 }
1778 }
1779
1780 UVM_ASSERT(status == NV_OK || status == NV_ERR_NO_MEMORY);
1781
1782 if (numa_fallback && status == NV_OK)
1783 status = NV_WARN_MORE_PROCESSING_REQUIRED;
1784
1785 return status;
1786 }
1787
1788 // Same as block_alloc_cpu_chunk() but allocate a chunk suitable for use as
1789 // a HMM destination page. The main difference is UVM does not own the reference
1790 // on the struct page backing these chunks.
1791 static NV_STATUS block_alloc_hmm_cpu_chunk(uvm_va_block_t *block,
1792 uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1793 uvm_cpu_chunk_alloc_flags_t flags,
1794 int nid,
1795 uvm_cpu_chunk_t **chunk)
1796 {
1797 NV_STATUS status;
1798
1799 UVM_ASSERT(uvm_va_block_is_hmm(block));
1800
1801 status = block_alloc_cpu_chunk(block, cpu_allocation_sizes, flags, nid, chunk);
1802 if (status == NV_OK)
1803 (*chunk)->type = UVM_CPU_CHUNK_TYPE_HMM;
1804
1805 return status;
1806 }
1807
1808 // Find the largest allocation size we can use for the given page_index in the
1809 // given block. Returns the mask of possible sizes and the region covered by
1810 // the largest. Callers may also elect to use a smaller size.
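// Illustrative example (assuming 4K/64K/2M chunk sizes): in a 2M-sized,
// 2M-aligned block where only page 0 is already allocated, a request for
// page 3 rejects 2M (its aligned region overlaps page 0) and 64K (same
// reason), so the returned mask is {4K} with *allocated_region covering
// just page 3.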
1811 static uvm_chunk_sizes_mask_t block_calculate_largest_alloc_size(uvm_va_block_t *va_block,
1812 uvm_page_index_t page_index,
1813 uvm_page_mask_t *allocated_mask,
1814 uvm_chunk_sizes_mask_t cpu_allocation_sizes,
1815 uvm_va_block_region_t *allocated_region)
1816 {
1817 uvm_chunk_size_t alloc_size;
1818 uvm_chunk_sizes_mask_t allocation_sizes = cpu_allocation_sizes;
1819
1820 for_each_chunk_size_rev(alloc_size, cpu_allocation_sizes) {
1821 NvU64 alloc_virt_addr;
1822
1823 // Page must be aligned to the allocation size.
1824 alloc_virt_addr = UVM_ALIGN_DOWN(uvm_va_block_cpu_page_address(va_block, page_index), alloc_size);
1825
1826 // Allocation region must fit within the VA block.
1827 if (!uvm_va_block_contains_address(va_block, alloc_virt_addr) ||
1828 !uvm_va_block_contains_address(va_block, alloc_virt_addr + alloc_size - 1)) {
1829 allocation_sizes &= ~alloc_size;
1830 continue;
1831 }
1832
1833 *allocated_region = uvm_va_block_region_from_start_end(va_block,
1834 alloc_virt_addr,
1835 alloc_virt_addr + alloc_size - 1);
1836
1837 // Allocation region can't overlap previously allocated regions.
1838 if (!uvm_page_mask_region_empty(allocated_mask, *allocated_region)) {
1839 allocation_sizes &= ~alloc_size;
1840 continue;
1841 }
1842
1843 return allocation_sizes;
1844 }
1845
1846 // No possible size was found.
1847 allocated_region->first = 0;
1848 allocated_region->outer = 0;
1849
1850 return UVM_CHUNK_SIZE_INVALID;
1851 }
1852
1853 // Handle insertion of overlapping CPU chunks.
1854 // In cases where the kernel allocates CPU chunks on NUMA nodes that already
1855 // have existing chunks, it's possible that the newly allocated chunk overlaps
1856 // existing chunks.
1857 // In such cases, the newly allocated chunk has to be appropriately split and
1858 // only the non-overlapping subchunks inserted into the block.
1859 // The sub-chunks that are not inserted are freed.
1860 // If there is an error during split, insertion, or mapping, any sub-chunks that
1861 // have already been successfully inserted will remain in the block. The rest of
1862 // the sub-chunks will be freed in order to maintain proper refcounts on the
1863 // parent chunk.
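// For example (assuming 4K/64K/2M chunk sizes), if a newly allocated 2M chunk
// lands on a node that already holds a 4K chunk somewhere in the same region,
// the 2M chunk is first split into 64K sub-chunks; sub-chunks covering only
// missing pages are inserted, fully covered ones are freed, and any sub-chunk
// that partially overlaps existing pages is split again into 4K chunks, of
// which only the missing ones are inserted.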
1864 static NV_STATUS block_populate_overlapping_cpu_chunks(uvm_va_block_t *block,
1865 uvm_page_mask_t *node_pages_mask,
1866 uvm_cpu_chunk_t *chunk,
1867 uvm_page_index_t page_index)
1868 {
1869 int nid = uvm_cpu_chunk_get_numa_node(chunk);
1870 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
1871 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
1872 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
1873 uvm_page_index_t running_page_index;
1874 uvm_cpu_chunk_t **split_chunks;
1875 uvm_cpu_chunk_t **small_chunks = NULL;
1876 uvm_cpu_chunk_t *chunk_ptr;
1877 uvm_chunk_size_t split_size;
1878 size_t i;
1879 NV_STATUS status;
1880
1881 UVM_ASSERT(IS_ALIGNED(uvm_va_block_cpu_page_address(block, page_index), chunk_size));
1882
1883 // Get a mask of all the chunk pages that are not overlapping existing
1884 // chunks.
1885 uvm_page_mask_init_from_region(node_pages_mask, chunk_region, NULL);
1886 uvm_page_mask_andnot(node_pages_mask, node_pages_mask, &node_state->allocated);
1887
1888 split_size = uvm_chunk_find_prev_size(uvm_cpu_chunk_get_allocation_sizes(), chunk_size);
1889 split_chunks = uvm_kvmalloc_zero((chunk_size / split_size) * sizeof(*split_chunks));
1890 if (!split_chunks) {
1891 uvm_cpu_chunk_free(chunk);
1892 return NV_ERR_NO_MEMORY;
1893 }
1894
1895 if (split_size > UVM_PAGE_SIZE_4K) {
1896 small_chunks = uvm_kvmalloc_zero(MAX_SMALL_CHUNKS_PER_BIG_SLOT * sizeof(*small_chunks));
1897 if (!small_chunks) {
1898 uvm_kvfree(split_chunks);
1899 uvm_cpu_chunk_free(chunk);
1900 return NV_ERR_NO_MEMORY;
1901 }
1902 }
1903
1904 // If we are here, we have to do at least one split.
1905 // We can't call any of the block_split_cpu_chunk_to_* functions since they
1906 // insert all of the split chunks into the block.
1907 // We only want to insert the sub-chunks that don't overlap. So, we have to
1908 // handle that by calling uvm_cpu_chunk_split() directly.
1909 status = uvm_cpu_chunk_split(chunk, split_chunks);
1910 if (status != NV_OK)
1911 goto done;
1912
1913 // Insert all split chunks that don't overlap existing allocations.
1914 // Note that this handles both splitting to 64K and 4K.
1915 running_page_index = page_index;
1916 for (i = 0; i < chunk_size / split_size; i++) {
1917 uvm_va_block_region_t subchunk_region = uvm_va_block_chunk_region(block, split_size, running_page_index);
1918
1919 // - If all the pages covered by the split chunk are missing, insert the
1920 // chunk into the block.
1921 // - If none of the pages are missing, free the chunk.
1922 // - Otherwise, some of the pages covered by the chunk are missing and a
1923 // second split will be needed.
1924 if (uvm_page_mask_region_full(node_pages_mask, subchunk_region)) {
1925 status = uvm_cpu_chunk_insert_in_block(block, split_chunks[i], running_page_index);
1926 if (status != NV_OK)
1927 goto done;
1928
1929 // To prevent double chunk freeing on error, clear the array pointer
1930 // before mapping.
1931 chunk_ptr = split_chunks[i];
1932 split_chunks[i] = NULL;
1933 status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
1934 if (status != NV_OK)
1935 goto done;
1936 }
1937 else if (uvm_page_mask_region_empty(node_pages_mask, subchunk_region)) {
1938 uvm_cpu_chunk_free(split_chunks[i]);
1939 split_chunks[i] = NULL;
1940 }
1941
1942 running_page_index = subchunk_region.outer;
1943 }
1944
1945 if (split_size > UVM_PAGE_SIZE_4K) {
1946 // Split any 64K chunks that overlap 4K chunks.
1947 for (i = 0; i < chunk_size / split_size; i++) {
1948 size_t j;
1949
1950 if (!split_chunks[i])
1951 continue;
1952
1953 running_page_index = page_index + ((split_size * i) / PAGE_SIZE);
1954 status = uvm_cpu_chunk_split(split_chunks[i], small_chunks);
1955 if (status != NV_OK)
1956 goto done;
1957
1958 for (j = 0; j < MAX_SMALL_CHUNKS_PER_BIG_SLOT; j++) {
1959 size_t chunk_num_pages = uvm_cpu_chunk_num_pages(small_chunks[j]);
1960
1961 if (uvm_page_mask_test(node_pages_mask, running_page_index)) {
1962 status = uvm_cpu_chunk_insert_in_block(block, small_chunks[j], running_page_index);
1963 if (status != NV_OK)
1964 goto done;
1965
1966 // To prevent double chunk freeing on error, clear the array pointer
1967 // before mapping.
1968 chunk_ptr = small_chunks[j];
1969 small_chunks[j] = NULL;
1970 status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk_ptr, running_page_index);
1971 if (status != NV_OK)
1972 goto done;
1973 }
1974 else {
1975 uvm_cpu_chunk_free(small_chunks[j]);
1976 }
1977
1978 running_page_index += chunk_num_pages;
1979 }
1980 }
1981 }
1982
1983 done:
1984 if (status != NV_OK) {
1985 // First, free any small chunks that have not been inserted.
1986 if (small_chunks) {
1987 for (i = 0; i < MAX_SMALL_CHUNKS_PER_BIG_SLOT; i++)
1988 uvm_cpu_chunk_free(small_chunks[i]);
1989 }
1990
1991 // Next, free any large chunks that have not been inserted.
1992 for (i = 0; i < chunk_size / split_size; i++)
1993 uvm_cpu_chunk_free(split_chunks[i]);
1994 }
1995
1996 uvm_kvfree(small_chunks);
1997 uvm_kvfree(split_chunks);
1998 return status;
1999 }
2000
2001 // Add the already allocated chunk to the block. Note that this
2002 // handles chunk management on failure, so the caller must not free
2003 // the chunk on failure.
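// Three cases are handled: the chunk's region is already fully allocated on
// its node (the new chunk is simply freed), partially allocated (only the
// missing pieces are split out and inserted via
// block_populate_overlapping_cpu_chunks()), or not allocated at all (the
// whole chunk is inserted and mapped on the GPUs).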
2004 static NV_STATUS block_add_cpu_chunk(uvm_va_block_t *block,
2005 uvm_page_mask_t *node_pages_mask,
2006 uvm_cpu_chunk_t *chunk,
2007 uvm_va_block_region_t region)
2008 {
2009 NV_STATUS status = NV_OK;
2010 int alloced_nid;
2011 uvm_va_block_cpu_node_state_t *node_state;
2012 uvm_page_index_t page_index = region.first;
2013
2014 alloced_nid = uvm_cpu_chunk_get_numa_node(chunk);
2015 node_state = block_node_state_get(block, alloced_nid);
2016 if (!uvm_page_mask_region_empty(&node_state->allocated, region)) {
2017 // We may have ended up falling back to allocating the chunk on a
2018         // non-preferred node which may already have had a chunk allocated on
2019         // it, in which case we can discard the new chunk.
2020 if (uvm_page_mask_region_full(&node_state->allocated, region)) {
2021 uvm_cpu_chunk_free(chunk);
2022 }
2023 else {
2024 // There is no need to free the chunk on failure since
2025 // block_populate_overlapping_cpu_chunks() would already have
2026 // done it.
2027 status = block_populate_overlapping_cpu_chunks(block, node_pages_mask, chunk, page_index);
2028 }
2029
2030 return status;
2031 }
2032 else {
2033 status = uvm_cpu_chunk_insert_in_block(block, chunk, page_index);
2034 if (status != NV_OK)
2035 goto out;
2036
2037 status = uvm_va_block_map_cpu_chunk_on_gpus(block, chunk, page_index);
2038 if (status != NV_OK) {
2039 uvm_cpu_chunk_remove_from_block(block, uvm_cpu_chunk_get_numa_node(chunk), page_index);
2040 goto out;
2041 }
2042 }
2043
2044 out:
2045 if (status != NV_OK) {
2046 // We free the chunk even though it was allocated by the caller because
2047 // block_populate_overlapping_cpu_chunks() can fail after freeing the
2048         // original chunk, so we need to do the same here.
2049 uvm_cpu_chunk_free(chunk);
2050 }
2051
2052 return status;
2053 }
2054
2055 // Allocates the input page in the block, if it doesn't already exist
2056 //
2057 // Also maps the page for physical access by all GPUs used by the block, which
2058 // is required for IOMMU support. Skipped on GPUs without access to CPU memory,
2059 // e.g., when the Confidential Computing feature is enabled.
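// In outline: pick the preferred NUMA node (test override, then the
// make_resident destination, then the VA policy), skip pages that are already
// allocated or resident there, compute the largest chunk size that fits and
// doesn't overlap existing allocations, allocate (zeroing when not all covered
// pages are resident elsewhere), then insert the chunk and map it for physical
// access by the GPUs that have state in this block.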
2060 static NV_STATUS block_populate_pages_cpu(uvm_va_block_t *block,
2061 uvm_page_mask_t *populate_page_mask,
2062 uvm_va_block_region_t populate_region,
2063 uvm_va_block_context_t *block_context,
2064 bool staged)
2065 {
2066 NV_STATUS status = NV_OK;
2067 uvm_cpu_chunk_t *chunk;
2068 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2069 uvm_chunk_sizes_mask_t cpu_allocation_sizes = uvm_cpu_chunk_get_allocation_sizes();
2070 uvm_page_mask_t *resident_mask = &block_context->scratch_page_mask;
2071 uvm_page_mask_t *allocated_mask;
2072 uvm_cpu_chunk_alloc_flags_t alloc_flags = UVM_CPU_CHUNK_ALLOC_FLAGS_NONE;
2073 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2074 const uvm_va_policy_t *policy = uvm_va_policy_get_region(block, populate_region);
2075 uvm_page_index_t page_index;
2076 uvm_gpu_id_t id;
2077 int preferred_nid = block_context->make_resident.dest_nid;
2078
2079 if (block_test && block_test->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
2080 preferred_nid = block_test->cpu_chunk_allocation_target_id;
2081
2082 // If the VA range has a preferred NUMA node, use it.
2083 if (preferred_nid == NUMA_NO_NODE)
2084 preferred_nid = policy->preferred_nid;
2085
2086 // TODO: Bug 4158598: Using NUMA_NO_NODE for staging allocations is sub-optimal.
2087 if (preferred_nid != NUMA_NO_NODE) {
2088 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, preferred_nid);
2089 allocated_mask = &node_state->allocated;
2090 alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
2091 }
2092 else {
2093 allocated_mask = &block->cpu.allocated;
2094 }
2095
2096 if (va_space->test.allow_allocation_from_movable)
2097 alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ALLOW_MOVABLE;
2098
2099 // Check whether all requested pages have already been allocated.
2100 uvm_page_mask_init_from_region(&block_context->scratch_page_mask, populate_region, populate_page_mask);
2101 if (!uvm_page_mask_andnot(&block_context->scratch_page_mask,
2102 &block_context->scratch_page_mask,
2103 allocated_mask))
2104 return NV_OK;
2105
2106 if (block_test) {
2107 if (block_test->cpu_chunk_allocation_size_mask)
2108 cpu_allocation_sizes &= block_test->cpu_chunk_allocation_size_mask;
2109 }
2110
2111 uvm_page_mask_zero(resident_mask);
2112 for_each_id_in_mask(id, &block->resident)
2113 uvm_page_mask_or(resident_mask, resident_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
2114
2115 // If the VA space has a UVM-Lite GPU registered, only PAGE_SIZE allocations
2116 // should be used in order to avoid extra copies due to dirty compound
2117 // pages. HMM va_blocks also require PAGE_SIZE allocations.
2118 // TODO: Bug 3368756: add support for HMM transparent huge page (THP)
2119 // migrations.
2120
2121 if (!uvm_processor_mask_empty(&va_space->non_faultable_processors) || uvm_va_block_is_hmm(block))
2122 cpu_allocation_sizes = PAGE_SIZE;
2123
2124 if (block_context->mm && !uvm_va_block_is_hmm(block))
2125 alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ACCOUNT;
2126
2127 UVM_ASSERT(cpu_allocation_sizes >= PAGE_SIZE);
2128 UVM_ASSERT(cpu_allocation_sizes & PAGE_SIZE);
2129
2130 for_each_va_block_page_in_region_mask(page_index, populate_page_mask, populate_region) {
2131 uvm_cpu_chunk_alloc_flags_t chunk_alloc_flags = alloc_flags;
2132 uvm_va_block_region_t region = populate_region;
2133 uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
2134 uvm_chunk_sizes_mask_t allocation_sizes;
2135
2136 if (uvm_page_mask_test(allocated_mask, page_index) ||
2137 uvm_va_block_cpu_is_page_resident_on(block, preferred_nid, page_index)) {
2138 page_index = uvm_va_block_next_unset_page_in_mask(populate_region, allocated_mask, page_index) - 1;
2139 continue;
2140 }
2141
2142 allocation_sizes = block_calculate_largest_alloc_size(block,
2143 page_index,
2144 allocated_mask,
2145 cpu_allocation_sizes,
2146                                                                &region);
2147 if (allocation_sizes == UVM_CHUNK_SIZE_INVALID)
2148 return NV_ERR_NO_MEMORY;
2149
2150 // If not all pages in the allocation region are resident somewhere,
2151 // zero out the allocated page.
2152         // This could be wasteful if only a few pages in a high-order
2153         // allocation need to be zeroed out, but the alternative is to map
2154         // single sub-pages one-by-one.
2155 if (!uvm_page_mask_region_full(resident_mask, region))
2156 chunk_alloc_flags |= UVM_CPU_CHUNK_ALLOC_FLAGS_ZERO;
2157
2158 // Management of a page used for a staged migration is never handed off
2159 // to the kernel and is really just a driver managed page. Therefore
2160 // don't allocate a HMM chunk in this case.
2161 if (uvm_va_block_is_hmm(block) && !staged)
2162 status = block_alloc_hmm_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
2163 else
2164 status = block_alloc_cpu_chunk(block, allocation_sizes, chunk_alloc_flags, preferred_nid, &chunk);
2165
2166 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2167 alloc_flags &= ~UVM_CPU_CHUNK_ALLOC_FLAGS_STRICT;
2168 preferred_nid = NUMA_NO_NODE;
2169 block_context->make_resident.dest_nid = NUMA_NO_NODE;
2170 }
2171 else if (status != NV_OK) {
2172 return status;
2173 }
2174
2175         // A smaller chunk than the maximum size may have been allocated; update the region accordingly.
2176 region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
2177 status = block_add_cpu_chunk(block, node_pages_mask, chunk, region);
2178 if (status != NV_OK)
2179 return status;
2180
2181 // Skip iterating over all pages covered by the allocated chunk.
2182 page_index = region.outer - 1;
2183
2184 #if UVM_IS_CONFIG_HMM()
2185 if (uvm_va_block_is_hmm(block) && block_context)
2186 block_context->hmm.dst_pfns[page_index] = migrate_pfn(page_to_pfn(chunk->page));
2187 #endif
2188 }
2189
2190 return NV_OK;
2191 }
2192
2193 // Note this clears the block_context caller_page_mask.
2194 NV_STATUS uvm_va_block_populate_page_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index, uvm_va_block_context_t *block_context)
2195 {
2196 uvm_page_mask_t *page_mask = &block_context->caller_page_mask;
2197
2198 uvm_page_mask_zero(page_mask);
2199 uvm_page_mask_set(page_mask, page_index);
2200 return block_populate_pages_cpu(va_block, page_mask, uvm_va_block_region_from_block(va_block), block_context, false);
2201 }
2202
2203 // Try allocating a chunk. If eviction was required,
2204 // NV_ERR_MORE_PROCESSING_REQUIRED will be returned since the block's lock was
2205 // unlocked and relocked. The caller is responsible for adding the chunk to the
2206 // retry used_chunks list.
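// Callers are expected to run this under a retry loop: on
// NV_ERR_MORE_PROCESSING_REQUIRED the chunk allocated via eviction is stashed
// in the retry structure and the whole operation is restarted, at which point
// block_retry_get_free_chunk() below picks the chunk back up without going to
// PMM again.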
2207 static NV_STATUS block_alloc_gpu_chunk(uvm_va_block_t *block,
2208 uvm_va_block_retry_t *retry,
2209 uvm_gpu_t *gpu,
2210 uvm_chunk_size_t size,
2211 uvm_gpu_chunk_t **out_gpu_chunk)
2212 {
2213 NV_STATUS status = NV_OK;
2214 uvm_gpu_chunk_t *gpu_chunk;
2215
2216 // First try getting a free chunk from previously-made allocations.
2217 gpu_chunk = block_retry_get_free_chunk(retry, gpu, size);
2218 if (!gpu_chunk) {
2219 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
2220 if (block_test && block_test->user_pages_allocation_retry_force_count > 0) {
2221 // Force eviction by pretending the allocation failed with no memory
2222 --block_test->user_pages_allocation_retry_force_count;
2223 status = NV_ERR_NO_MEMORY;
2224 }
2225 else {
2226 // Try allocating a new one without eviction
2227 status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_NONE, &gpu_chunk, &retry->tracker);
2228 }
2229
2230 if (status == NV_ERR_NO_MEMORY) {
2231 // If that fails with no memory, try allocating with eviction and
2232 // return back to the caller immediately so that the operation can
2233 // be restarted.
2234 uvm_mutex_unlock(&block->lock);
2235
2236 status = uvm_pmm_gpu_alloc_user(&gpu->pmm, 1, size, UVM_PMM_ALLOC_FLAGS_EVICT, &gpu_chunk, &retry->tracker);
2237 if (status == NV_OK) {
2238 block_retry_add_free_chunk(retry, gpu_chunk);
2239 status = NV_ERR_MORE_PROCESSING_REQUIRED;
2240 }
2241
2242 uvm_mutex_lock(&block->lock);
2243 return status;
2244 }
2245 else if (status != NV_OK) {
2246 return status;
2247 }
2248 }
2249
2250 *out_gpu_chunk = gpu_chunk;
2251 return NV_OK;
2252 }
2253
2254 static bool block_gpu_has_page_tables(uvm_va_block_t *block, uvm_gpu_t *gpu)
2255 {
2256 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
2257
2258 if (!gpu_state)
2259 return false;
2260
2261 return gpu_state->page_table_range_4k.table ||
2262 gpu_state->page_table_range_big.table ||
2263 gpu_state->page_table_range_2m.table;
2264 }
2265
2266 // A helper to get a known-to-be-present GPU VA space given a VA block that's
2267 // locked. In order to use this function, the caller must know that at least one
2268 // of these conditions is true:
2269 //
2270 // 1) The VA space lock is held
2271 // 2) The VA block has active page tables for the GPU
2272 //
2273 // If the VA space lock is held (#1), then the gpu_va_space obviously can't go
2274 // away.
2275 //
2276 // On the eviction path, we don't have a lock on the VA space state. However,
2277 // since remove_gpu_va_space walks each block to unmap the GPU and free GPU page
2278 // tables before destroying the gpu_va_space, we're guaranteed that if this GPU
2279 // has page tables (#2), the gpu_va_space can't go away while we're holding the
2280 // block lock.
2281 static uvm_gpu_va_space_t *uvm_va_block_get_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2282 {
2283 uvm_gpu_va_space_t *gpu_va_space;
2284 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2285
2286 UVM_ASSERT(gpu);
2287
2288 if (!block_gpu_has_page_tables(va_block, gpu))
2289 uvm_assert_rwsem_locked(&va_space->lock);
2290
2291 UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id));
2292
2293 gpu_va_space = va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)];
2294
2295 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
2296 UVM_ASSERT(gpu_va_space->va_space == va_space);
2297 UVM_ASSERT(gpu_va_space->gpu == gpu);
2298
2299 return gpu_va_space;
2300 }
2301
2302 static bool block_gpu_supports_2m(uvm_va_block_t *block, uvm_gpu_t *gpu)
2303 {
2304 uvm_gpu_va_space_t *gpu_va_space;
2305
2306 // TODO: Bug 3368756: add HMM support for transparent huge page migrations.
2307 if (uvm_va_block_size(block) < UVM_PAGE_SIZE_2M || uvm_va_block_is_hmm(block))
2308 return false;
2309
2310 UVM_ASSERT(uvm_va_block_size(block) == UVM_PAGE_SIZE_2M);
2311
2312 gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
2313 return uvm_mmu_page_size_supported(&gpu_va_space->page_tables, UVM_PAGE_SIZE_2M);
2314 }
2315
2316 NvU32 uvm_va_block_gpu_big_page_size(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
2317 {
2318 uvm_gpu_va_space_t *gpu_va_space;
2319
2320 gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
2321 return gpu_va_space->page_tables.big_page_size;
2322 }
2323
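// Worked example: with big_page_size = 64K, start = 0x211000 and
// end = 0x23ffff, first_addr rounds up to 0x220000 and outer_addr rounds down
// to 0x240000, giving the page region [15, 47): 32 small pages, i.e. two
// fully covered big pages. If the range can't hold even one aligned big page,
// the empty region [0, 0) is returned.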
2324 static uvm_va_block_region_t range_big_page_region_all(NvU64 start, NvU64 end, NvU32 big_page_size)
2325 {
2326 NvU64 first_addr = UVM_ALIGN_UP(start, big_page_size);
2327 NvU64 outer_addr = UVM_ALIGN_DOWN(end + 1, big_page_size);
2328
2329 // The range must fit within a VA block
2330 UVM_ASSERT(UVM_VA_BLOCK_ALIGN_DOWN(start) == UVM_VA_BLOCK_ALIGN_DOWN(end));
2331
2332 if (outer_addr <= first_addr)
2333 return uvm_va_block_region(0, 0);
2334
2335 return uvm_va_block_region((first_addr - start) / PAGE_SIZE, (outer_addr - start) / PAGE_SIZE);
2336 }
2337
2338 static size_t range_num_big_pages(NvU64 start, NvU64 end, NvU32 big_page_size)
2339 {
2340 uvm_va_block_region_t region = range_big_page_region_all(start, end, big_page_size);
2341 return (size_t)uvm_div_pow2_64(uvm_va_block_region_size(region), big_page_size);
2342 }
2343
2344 uvm_va_block_region_t uvm_va_block_big_page_region_all(uvm_va_block_t *va_block, NvU32 big_page_size)
2345 {
2346 return range_big_page_region_all(va_block->start, va_block->end, big_page_size);
2347 }
2348
2349 uvm_va_block_region_t uvm_va_block_big_page_region_subset(uvm_va_block_t *va_block,
2350 uvm_va_block_region_t region,
2351 NvU32 big_page_size)
2352 {
2353 NvU64 start = uvm_va_block_region_start(va_block, region);
2354 NvU64 end = uvm_va_block_region_end(va_block, region);
2355 uvm_va_block_region_t big_region;
2356
2357 UVM_ASSERT(start < va_block->end);
2358 UVM_ASSERT(end <= va_block->end);
2359
2360 big_region = range_big_page_region_all(start, end, big_page_size);
2361 if (big_region.outer) {
2362 big_region.first += region.first;
2363 big_region.outer += region.first;
2364 }
2365
2366 return big_region;
2367 }
2368
2369 size_t uvm_va_block_num_big_pages(uvm_va_block_t *va_block, NvU32 big_page_size)
2370 {
2371 return range_num_big_pages(va_block->start, va_block->end, big_page_size);
2372 }
2373
2374 NvU64 uvm_va_block_big_page_addr(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
2375 {
2376 NvU64 addr = UVM_ALIGN_UP(va_block->start, big_page_size) + (big_page_index * big_page_size);
2377 UVM_ASSERT(addr >= va_block->start);
2378 UVM_ASSERT(addr < va_block->end);
2379 return addr;
2380 }
2381
2382 uvm_va_block_region_t uvm_va_block_big_page_region(uvm_va_block_t *va_block, size_t big_page_index, NvU32 big_page_size)
2383 {
2384 NvU64 page_addr = uvm_va_block_big_page_addr(va_block, big_page_index, big_page_size);
2385
2386 // Assume that we don't have to handle multiple big PTEs per system page.
2387 // It's not terribly difficult to implement, but we don't currently have a
2388 // use case.
2389 UVM_ASSERT(big_page_size >= PAGE_SIZE);
2390
2391 return uvm_va_block_region_from_start_size(va_block, page_addr, big_page_size);
2392 }
2393
2394 // Returns the big page index (the bit index within
2395 // uvm_va_block_gpu_state_t::big_ptes) corresponding to page_index. If
2396 // page_index cannot be covered by a big PTE due to alignment or block size,
2397 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK is returned.
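// Continuing the example above (64K big pages, block starting at 0x211000):
// big pages span the page region [15, 47), so page_index 20 maps to big page
// index 0, while page_index 5, which lies in the unaligned head of the block,
// returns MAX_BIG_PAGES_PER_UVM_VA_BLOCK.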
2398 size_t uvm_va_block_big_page_index(uvm_va_block_t *va_block, uvm_page_index_t page_index, NvU32 big_page_size)
2399 {
2400 uvm_va_block_region_t big_region_all = uvm_va_block_big_page_region_all(va_block, big_page_size);
2401 size_t big_index;
2402
2403 // Note that this condition also handles the case of having no big pages in
2404 // the block, in which case .first >= .outer.
2405 if (page_index < big_region_all.first || page_index >= big_region_all.outer)
2406 return MAX_BIG_PAGES_PER_UVM_VA_BLOCK;
2407
2408 big_index = (size_t)uvm_div_pow2_64((page_index - big_region_all.first) * PAGE_SIZE, big_page_size);
2409
2410 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) >= va_block->start);
2411 UVM_ASSERT(uvm_va_block_big_page_addr(va_block, big_index, big_page_size) + big_page_size <= va_block->end + 1);
2412
2413 return big_index;
2414 }
2415
2416 static void uvm_page_mask_init_from_big_ptes(uvm_va_block_t *block,
2417 uvm_gpu_t *gpu,
2418 uvm_page_mask_t *mask_out,
2419 const unsigned long *big_ptes_in)
2420 {
2421 uvm_va_block_region_t big_region;
2422 size_t big_page_index;
2423 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
2424
2425 uvm_page_mask_zero(mask_out);
2426
2427 for_each_set_bit(big_page_index, big_ptes_in, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
2428 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
2429 uvm_page_mask_region_fill(mask_out, big_region);
2430 }
2431 }
2432
2433 NvU32 uvm_va_block_page_size_cpu(uvm_va_block_t *va_block, uvm_page_index_t page_index)
2434 {
2435 if (!uvm_page_mask_test(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
2436 return 0;
2437
2438 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU));
2439
2440 // Despite the fact that physical CPU memory can be allocated at sizes
2441 // greater than PAGE_SIZE, vm_insert_page(s)() always maps CPU memory
2442 // with 4K PTEs. Until the core kernel adds support for PMD mappings,
2443 // the return value of this function will remain at PAGE_SIZE.
2444 return PAGE_SIZE;
2445 }
2446
2447 NvU32 uvm_va_block_page_size_gpu(uvm_va_block_t *va_block, uvm_gpu_id_t gpu_id, uvm_page_index_t page_index)
2448 {
2449 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2450 size_t big_page_size, big_page_index;
2451
2452 if (!gpu_state)
2453 return 0;
2454
2455 if (!uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
2456 return 0;
2457
2458 UVM_ASSERT(uvm_processor_mask_test(&va_block->mapped, gpu_id));
2459
2460 if (gpu_state->pte_is_2m)
2461 return UVM_PAGE_SIZE_2M;
2462
2463 big_page_size = uvm_va_block_gpu_big_page_size(va_block, block_get_gpu(va_block, gpu_id));
2464 big_page_index = uvm_va_block_big_page_index(va_block, page_index, big_page_size);
2465 if (big_page_index != MAX_BIG_PAGES_PER_UVM_VA_BLOCK && test_bit(big_page_index, gpu_state->big_ptes))
2466 return big_page_size;
2467
2468 return UVM_PAGE_SIZE_4K;
2469 }
2470
2471 // Get the size of the physical allocation backing the page, or 0 if not
2472 // resident. Note that this is different from uvm_va_block_page_size_* because
2473 // those return the size of the PTE which maps the page index, which may be
2474 // smaller than the physical allocation.
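// For example, a page backed by a 2M CPU chunk is still mapped by the CPU
// with 4K PTEs, so uvm_va_block_page_size_cpu() reports PAGE_SIZE while this
// function reports the 2M backing allocation.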
2475 static NvU32 block_phys_page_size(uvm_va_block_t *block, block_phys_page_t page)
2476 {
2477 uvm_va_block_gpu_state_t *gpu_state;
2478 uvm_chunk_size_t chunk_size;
2479
2480 if (UVM_ID_IS_CPU(page.processor)) {
2481 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, page.nid, page.page_index);
2482 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, page.processor, NUMA_NO_NODE);
2483
2484 if (!uvm_page_mask_test(resident_mask, page.page_index))
2485 return 0;
2486
2487 UVM_ASSERT(uvm_processor_mask_test(&block->resident, UVM_ID_CPU));
2488 return (NvU32)uvm_cpu_chunk_get_size(chunk);
2489 }
2490
2491 gpu_state = uvm_va_block_gpu_state_get(block, page.processor);
2492 if (!gpu_state || !uvm_page_mask_test(&gpu_state->resident, page.page_index))
2493 return 0;
2494
2495 UVM_ASSERT(uvm_processor_mask_test(&block->resident, page.processor));
2496 block_gpu_chunk_index(block, block_get_gpu(block, page.processor), page.page_index, &chunk_size);
2497 return (NvU32)chunk_size;
2498 }
2499
2500 NvU32 uvm_va_block_get_physical_size(uvm_va_block_t *block,
2501 uvm_processor_id_t processor,
2502 uvm_page_index_t page_index)
2503 {
2504 int nid = NUMA_NO_NODE;
2505 block_phys_page_t page;
2506
2507 UVM_ASSERT(block);
2508
2509 uvm_assert_mutex_locked(&block->lock);
2510
2511 if (UVM_ID_IS_CPU(processor)) {
2512 nid = block_get_page_node_residency(block, page_index);
2513 if (nid == NUMA_NO_NODE)
2514 return 0;
2515 }
2516
2517 page = block_phys_page(processor, nid, page_index);
2518
2519 return block_phys_page_size(block, page);
2520 }
2521
2522 static uvm_pte_bits_cpu_t get_cpu_pte_bit_index(uvm_prot_t prot)
2523 {
2524 uvm_pte_bits_cpu_t pte_bit_index = UVM_PTE_BITS_CPU_MAX;
2525
2526 // ATOMIC and WRITE are synonyms for the CPU
2527 if (prot == UVM_PROT_READ_WRITE_ATOMIC || prot == UVM_PROT_READ_WRITE)
2528 pte_bit_index = UVM_PTE_BITS_CPU_WRITE;
2529 else if (prot == UVM_PROT_READ_ONLY)
2530 pte_bit_index = UVM_PTE_BITS_CPU_READ;
2531 else
2532 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
2533
2534 return pte_bit_index;
2535 }
2536
2537 static uvm_pte_bits_gpu_t get_gpu_pte_bit_index(uvm_prot_t prot)
2538 {
2539 uvm_pte_bits_gpu_t pte_bit_index = UVM_PTE_BITS_GPU_MAX;
2540
2541 if (prot == UVM_PROT_READ_WRITE_ATOMIC)
2542 pte_bit_index = UVM_PTE_BITS_GPU_ATOMIC;
2543 else if (prot == UVM_PROT_READ_WRITE)
2544 pte_bit_index = UVM_PTE_BITS_GPU_WRITE;
2545 else if (prot == UVM_PROT_READ_ONLY)
2546 pte_bit_index = UVM_PTE_BITS_GPU_READ;
2547 else
2548 UVM_ASSERT_MSG(false, "Invalid access permissions %s\n", uvm_prot_string(prot));
2549
2550 return pte_bit_index;
2551 }
2552
2553 uvm_page_mask_t *uvm_va_block_resident_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
2554 {
2555 uvm_va_block_gpu_state_t *gpu_state;
2556 uvm_page_mask_t *resident_mask;
2557
2558 if (UVM_ID_IS_CPU(processor)) {
2559 uvm_va_block_cpu_node_state_t *node_state;
2560
2561 if (nid == NUMA_NO_NODE) {
2562 resident_mask = &block->cpu.resident;
2563 }
2564 else {
2565 node_state = block_node_state_get(block, nid);
2566 resident_mask = &node_state->resident;
2567 }
2568 }
2569 else {
2570 gpu_state = uvm_va_block_gpu_state_get(block, processor);
2571 UVM_ASSERT(gpu_state);
2572 resident_mask = &gpu_state->resident;
2573 }
2574
2575 return resident_mask;
2576 }
2577
2578 // Get the page residency mask for a processor
2579 //
2580 // Notably this will allocate GPU state if not yet present; if that fails,
2581 // NULL is returned.
2582 static uvm_page_mask_t *block_resident_mask_get_alloc(uvm_va_block_t *block, uvm_processor_id_t processor, int nid)
2583 {
2584 uvm_va_block_gpu_state_t *gpu_state;
2585
2586 if (UVM_ID_IS_CPU(processor))
2587 return uvm_va_block_resident_mask_get(block, processor, nid);
2588
2589 gpu_state = block_gpu_state_get_alloc(block, block_get_gpu(block, processor));
2590 if (!gpu_state)
2591 return NULL;
2592
2593 return &gpu_state->resident;
2594 }
2595
2596 static const uvm_page_mask_t *block_map_with_prot_mask_get(uvm_va_block_t *block,
2597 uvm_processor_id_t processor,
2598 uvm_prot_t prot)
2599 {
2600 uvm_va_block_gpu_state_t *gpu_state;
2601
2602 if (UVM_ID_IS_CPU(processor))
2603 return &block->cpu.pte_bits[get_cpu_pte_bit_index(prot)];
2604
2605 gpu_state = uvm_va_block_gpu_state_get(block, processor);
2606
2607 UVM_ASSERT(gpu_state);
2608 return &gpu_state->pte_bits[get_gpu_pte_bit_index(prot)];
2609 }
2610
2611 const uvm_page_mask_t *uvm_va_block_map_mask_get(uvm_va_block_t *block, uvm_processor_id_t processor)
2612 {
2613 return block_map_with_prot_mask_get(block, processor, UVM_PROT_READ_ONLY);
2614 }
2615
2616 void uvm_va_block_unmapped_pages_get(uvm_va_block_t *va_block,
2617 uvm_va_block_region_t region,
2618 uvm_page_mask_t *out_mask)
2619 {
2620 uvm_processor_id_t id;
2621
2622 uvm_assert_mutex_locked(&va_block->lock);
2623
2624 if (!uvm_va_block_is_hmm(va_block)) {
2625 uvm_page_mask_complement(out_mask, &va_block->maybe_mapped_pages);
2626 return;
2627 }
2628
2629 uvm_page_mask_region_fill(out_mask, region);
2630
2631 for_each_id_in_mask(id, &va_block->mapped) {
2632 uvm_page_mask_andnot(out_mask, out_mask, uvm_va_block_map_mask_get(va_block, id));
2633 }
2634 }
2635
2636 static const uvm_page_mask_t *block_evicted_mask_get(uvm_va_block_t *block, uvm_gpu_id_t gpu_id)
2637 {
2638 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
2639 UVM_ASSERT(gpu_state);
2640
2641 return &gpu_state->evicted;
2642 }
2643
2644 static bool block_is_page_resident_anywhere(uvm_va_block_t *block, uvm_page_index_t page_index)
2645 {
2646 uvm_processor_id_t id;
2647 for_each_id_in_mask(id, &block->resident) {
2648 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE), page_index))
2649 return true;
2650 }
2651
2652 return false;
2653 }
2654
2655 static bool block_processor_page_is_populated(uvm_va_block_t *block, uvm_processor_id_t proc, uvm_page_index_t page_index)
2656 {
2657 uvm_va_block_gpu_state_t *gpu_state;
2658 size_t chunk_index;
2659
2660 if (UVM_ID_IS_CPU(proc))
2661 return uvm_page_mask_test(&block->cpu.allocated, page_index);
2662
2663 gpu_state = uvm_va_block_gpu_state_get(block, proc);
2664 if (!gpu_state)
2665 return false;
2666
2667 chunk_index = block_gpu_chunk_index(block, block_get_gpu(block, proc), page_index, NULL);
2668 return gpu_state->chunks[chunk_index] != NULL;
2669 }
2670
2671 // Compute the GPUs that have at least the given access permissions for the
2672 // range described by region and page_mask. The function sets the bit if any
2673 // page in the region has the permissions.
2674 static void block_region_authorized_gpus(uvm_va_block_t *va_block,
2675 uvm_va_block_region_t region,
2676 uvm_prot_t access_permission,
2677 uvm_processor_mask_t *authorized_gpus)
2678 {
2679 uvm_gpu_id_t gpu_id;
2680 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(access_permission);
2681
2682 uvm_processor_mask_zero(authorized_gpus);
2683
2684 // Test all GPUs with mappings on the block
2685 for_each_gpu_id_in_mask(gpu_id, &va_block->mapped) {
2686 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2687 if (gpu_state && !uvm_page_mask_region_empty(&gpu_state->pte_bits[search_gpu_bit], region))
2688 uvm_processor_mask_set(authorized_gpus, gpu_id);
2689 }
2690 }
2691
2692 // Compute the processors that have at least the given access permissions for
2693 // the range described by region and page_mask. The function sets the bit if any
2694 // page in the region has the permissions.
2695 static void block_region_authorized_processors(uvm_va_block_t *va_block,
2696 uvm_va_block_region_t region,
2697 uvm_prot_t access_permission,
2698 uvm_processor_mask_t *authorized_processors)
2699 {
2700 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(access_permission);
2701
2702 // Compute GPUs
2703 block_region_authorized_gpus(va_block, region, access_permission, authorized_processors);
2704
2705 // Test CPU
2706 if (uvm_processor_mask_test(&va_block->mapped, UVM_ID_CPU) &&
2707 !uvm_page_mask_region_empty(&va_block->cpu.pte_bits[search_cpu_bit], region)) {
2708 uvm_processor_mask_set(authorized_processors, UVM_ID_CPU);
2709 }
2710 }
2711
2712 static void block_page_authorized_processors(uvm_va_block_t *va_block,
2713 uvm_page_index_t page_index,
2714 uvm_prot_t access_permission,
2715 uvm_processor_mask_t *authorized_processors)
2716 {
2717 block_region_authorized_processors(va_block,
2718 uvm_va_block_region_for_page(page_index),
2719 access_permission,
2720 authorized_processors);
2721 }
2722
2723 static bool block_is_gpu_authorized_on_whole_region(uvm_va_block_t *va_block,
2724 uvm_va_block_region_t region,
2725 uvm_gpu_id_t gpu_id,
2726 uvm_prot_t required_prot)
2727 {
2728 uvm_pte_bits_gpu_t search_gpu_bit = get_gpu_pte_bit_index(required_prot);
2729 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
2730
2731 if (!gpu_state)
2732 return false;
2733
2734 return uvm_page_mask_region_full(&gpu_state->pte_bits[search_gpu_bit], region);
2735 }
2736
2737 static bool block_is_processor_authorized_on_whole_region(uvm_va_block_t *va_block,
2738 uvm_va_block_region_t region,
2739 uvm_processor_id_t processor_id,
2740 uvm_prot_t required_prot)
2741 {
2742 if (UVM_ID_IS_CPU(processor_id)) {
2743 uvm_pte_bits_cpu_t search_cpu_bit = get_cpu_pte_bit_index(required_prot);
2744
2745 return uvm_page_mask_region_full(&va_block->cpu.pte_bits[search_cpu_bit], region);
2746 }
2747 else {
2748 return block_is_gpu_authorized_on_whole_region(va_block, region, processor_id, required_prot);
2749 }
2750 }
2751
2752 bool uvm_va_block_page_is_gpu_authorized(uvm_va_block_t *va_block,
2753 uvm_page_index_t page_index,
2754 uvm_gpu_id_t gpu_id,
2755 uvm_prot_t required_prot)
2756 {
2757 return block_is_gpu_authorized_on_whole_region(va_block,
2758 uvm_va_block_region_for_page(page_index),
2759 gpu_id,
2760 required_prot);
2761 }
2762
2763 static bool block_page_is_processor_authorized(uvm_va_block_t *va_block,
2764 uvm_page_index_t page_index,
2765 uvm_processor_id_t processor_id,
2766 uvm_prot_t required_prot)
2767 {
2768 return block_is_processor_authorized_on_whole_region(va_block,
2769 uvm_va_block_region_for_page(page_index),
2770 processor_id,
2771 required_prot);
2772 }
2773
2774 // Compute the gpus that have a copy of the given page resident in their memory
2775 static void block_page_resident_gpus(uvm_va_block_t *va_block,
2776 uvm_page_index_t page_index,
2777 uvm_processor_mask_t *resident_gpus)
2778 {
2779 uvm_gpu_id_t id;
2780 uvm_processor_mask_zero(resident_gpus);
2781
2782 for_each_gpu_id_in_mask(id, &va_block->resident) {
2783 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index)) {
2784 UVM_ASSERT(block_processor_page_is_populated(va_block, id, page_index));
2785 uvm_processor_mask_set(resident_gpus, id);
2786 }
2787 }
2788 }
2789
2790 // Compute the processors that have a copy of the given page resident in their
2791 // memory.
2792 void uvm_va_block_page_resident_processors(uvm_va_block_t *va_block,
2793 uvm_page_index_t page_index,
2794 uvm_processor_mask_t *resident_processors)
2795 {
2796 block_page_resident_gpus(va_block, page_index, resident_processors);
2797
2798 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE), page_index)) {
2799 UVM_ASSERT(block_processor_page_is_populated(va_block, UVM_ID_CPU, page_index));
2800 uvm_processor_mask_set(resident_processors, UVM_ID_CPU);
2801 }
2802 }
2803
2804 NvU32 uvm_va_block_page_resident_processors_count(uvm_va_block_t *va_block,
2805 uvm_va_block_context_t *va_block_context,
2806 uvm_page_index_t page_index)
2807 {
2808 uvm_processor_mask_t *resident_processors = &va_block_context->scratch_processor_mask;
2809 uvm_va_block_page_resident_processors(va_block, page_index, resident_processors);
2810
2811 return uvm_processor_mask_get_count(resident_processors);
2812 }
2813
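// Returns the processor in the block's resident mask which is closest to the
// given processor and has the page resident, or UVM_ID_INVALID if the page is
// not resident anywhere.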
2814 uvm_processor_id_t uvm_va_block_page_get_closest_resident(uvm_va_block_t *va_block,
2815 uvm_va_block_context_t *va_block_context,
2816 uvm_page_index_t page_index,
2817 uvm_processor_id_t processor)
2818 {
2819 uvm_processor_id_t id;
2820 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
2821
2822 uvm_processor_mask_copy(&va_block_context->scratch_processor_mask, &va_block->resident);
2823
2824 for_each_closest_id(id, &va_block_context->scratch_processor_mask, processor, va_space) {
2825 if (uvm_page_mask_test(uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE), page_index))
2826 return id;
2827 }
2828
2829 return UVM_ID_INVALID;
2830 }
2831
2832 // We don't track the specific aperture of each mapped page. Instead, we assume
2833 // that each virtual mapping from a given processor always targets the closest
2834 // processor on which that page is resident (with special rules for UVM-Lite).
2835 //
2836 // This function verifies that assumption: before a page becomes resident on a
2837 // new location, assert that no processor has a valid mapping to a farther
2838 // processor on that page.
2839 static bool block_check_resident_proximity(uvm_va_block_t *block,
2840 uvm_va_block_context_t *block_context,
2841 uvm_page_index_t page_index,
2842 uvm_processor_id_t new_residency)
2843 {
2844 uvm_processor_id_t mapped_id, closest_id;
2845 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
2846 uvm_processor_mask_t *resident_procs = &block_context->scratch_processor_mask;
2847 const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
2848
2849 for_each_id_in_mask(mapped_id, &block->mapped) {
2850 if (uvm_processor_mask_test(uvm_lite_gpus, mapped_id))
2851 continue;
2852
2853 if (!uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index))
2854 continue;
2855
2856 uvm_va_block_page_resident_processors(block, page_index, resident_procs);
2857 UVM_ASSERT(!uvm_processor_mask_empty(resident_procs));
2858 UVM_ASSERT(!uvm_processor_mask_test(resident_procs, new_residency));
2859 uvm_processor_mask_set(resident_procs, new_residency);
2860 closest_id = uvm_processor_mask_find_closest_id(va_space, resident_procs, mapped_id);
2861 UVM_ASSERT(!uvm_id_equal(closest_id, new_residency));
2862 }
2863
2864 return true;
2865 }
2866
2867 // Returns the processor to which page_index should be mapped on gpu
2868 static uvm_processor_id_t block_gpu_get_processor_to_map(uvm_va_block_t *block,
2869 uvm_va_block_context_t *block_context,
2870 uvm_gpu_t *gpu,
2871 uvm_page_index_t page_index)
2872 {
2873 uvm_processor_id_t dest_id;
2874
2875 // UVM-Lite GPUs can only map pages on the preferred location
2876 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id))
2877 return uvm_va_range_get_policy(block->va_range)->preferred_location;
2878
2879 // Otherwise we always map the closest resident processor
2880 dest_id = uvm_va_block_page_get_closest_resident(block, block_context, page_index, gpu->id);
2881 UVM_ASSERT(UVM_ID_IS_VALID(dest_id));
2882 return dest_id;
2883 }
2884
2885 // Returns the processor to which page_index should be mapped on mapping_id
2886 static uvm_processor_id_t block_get_processor_to_map(uvm_va_block_t *block,
2887 uvm_va_block_context_t *block_context,
2888 uvm_processor_id_t mapping_id,
2889 uvm_page_index_t page_index)
2890 {
2891
2892 if (UVM_ID_IS_CPU(mapping_id))
2893 return uvm_va_block_page_get_closest_resident(block, block_context, page_index, mapping_id);
2894
2895 return block_gpu_get_processor_to_map(block, block_context, block_get_gpu(block, mapping_id), page_index);
2896 }
2897
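// Compute the processors which map the given page and whose mapping is
// expected to target resident_id (see block_get_processor_to_map).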
2898 static void block_get_mapped_processors(uvm_va_block_t *block,
2899 uvm_va_block_context_t *block_context,
2900 uvm_processor_id_t resident_id,
2901 uvm_page_index_t page_index,
2902 uvm_processor_mask_t *mapped_procs)
2903 {
2904 uvm_processor_id_t mapped_id;
2905
2906 uvm_processor_mask_zero(mapped_procs);
2907
2908 for_each_id_in_mask(mapped_id, &block->mapped) {
2909 if (uvm_page_mask_test(uvm_va_block_map_mask_get(block, mapped_id), page_index)) {
2910 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, block_context, mapped_id, page_index);
2911
2912 if (uvm_id_equal(to_map_id, resident_id))
2913 uvm_processor_mask_set(mapped_procs, mapped_id);
2914 }
2915 }
2916 }
2917
2918 // We use block_gpu_get_processor_to_map to find the destination processor of a
2919 // given GPU mapping. This function is called when the mapping is established to
2920 // sanity check that the destination of the mapping matches the query.
2921 static bool block_check_mapping_residency_region(uvm_va_block_t *block,
2922 uvm_va_block_context_t *block_context,
2923 uvm_gpu_t *gpu,
2924 uvm_processor_id_t mapping_dest,
2925 uvm_va_block_region_t region,
2926 const uvm_page_mask_t *page_mask)
2927 {
2928 uvm_page_index_t page_index;
2929 for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
2930 NvU64 va = uvm_va_block_cpu_page_address(block, page_index);
2931 uvm_processor_id_t proc_to_map = block_gpu_get_processor_to_map(block, block_context, gpu, page_index);
2932 UVM_ASSERT_MSG(uvm_id_equal(mapping_dest, proc_to_map),
2933 "VA 0x%llx on %s: mapping %s, supposed to map %s",
2934 va,
2935 uvm_gpu_name(gpu),
2936 block_processor_name(block, mapping_dest),
2937 block_processor_name(block, proc_to_map));
2938 }
2939 return true;
2940 }
2941
2942 static bool block_check_mapping_residency(uvm_va_block_t *block,
2943 uvm_va_block_context_t *block_context,
2944 uvm_gpu_t *gpu,
2945 uvm_processor_id_t mapping_dest,
2946 const uvm_page_mask_t *page_mask)
2947 {
2948 return block_check_mapping_residency_region(block,
2949 block_context,
2950 gpu,
2951 mapping_dest,
2952 uvm_va_block_region_from_block(block),
2953 page_mask);
2954 }
2955
2956 // Check that there are no mappings targeting resident_id from any processor in
2957 // the block.
2958 static bool block_check_processor_not_mapped(uvm_va_block_t *block,
2959 uvm_va_block_context_t *block_context,
2960 uvm_processor_id_t resident_id)
2961 {
2962 uvm_processor_id_t mapped_id;
2963 uvm_page_index_t page_index;
2964
2965 for_each_id_in_mask(mapped_id, &block->mapped) {
2966 const uvm_page_mask_t *map_mask = uvm_va_block_map_mask_get(block, mapped_id);
2967
2968 for_each_va_block_page_in_mask(page_index, map_mask, block) {
2969 uvm_processor_id_t to_map_id = block_get_processor_to_map(block, block_context, mapped_id, page_index);
2970 UVM_ASSERT(!uvm_id_equal(to_map_id, resident_id));
2971 }
2972 }
2973
2974 return true;
2975 }
2976
2977 // Zero all pages of the newly-populated chunk which are not resident anywhere
2978 // else in the system, adding that work to the block's tracker. In all cases,
2979 // this function adds a dependency on the passed-in tracker to the block's tracker.
2980 static NV_STATUS block_zero_new_gpu_chunk(uvm_va_block_t *block,
2981 uvm_gpu_t *gpu,
2982 uvm_gpu_chunk_t *chunk,
2983 uvm_va_block_region_t chunk_region,
2984 uvm_tracker_t *tracker)
2985 {
2986 uvm_va_block_gpu_state_t *gpu_state;
2987 NV_STATUS status;
2988 uvm_gpu_address_t memset_addr_base, memset_addr;
2989 uvm_push_t push;
2990 uvm_gpu_id_t id;
2991 uvm_va_block_region_t subregion;
2992 uvm_page_mask_t *zero_mask;
2993
2994 UVM_ASSERT(uvm_va_block_region_size(chunk_region) == uvm_gpu_chunk_get_size(chunk));
2995
2996 if (chunk->is_zero)
2997 return NV_OK;
2998
2999 gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3000 zero_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
3001
3002 if (!zero_mask)
3003 return NV_ERR_NO_MEMORY;
3004
3005 // Tradeoff: zeroing the entire chunk vs. zeroing only the pages needed for
3006 // the operation.
3007 //
3008 // We may over-zero the page with this approach. For example, we might be
3009 // populating a 2MB chunk because only a single page within that chunk needs
3010 // to be made resident. If we also zero non-resident pages outside of the
3011 // strict region, we could waste the effort if those pages are populated on
3012 // another processor later and migrated here.
3013 //
3014 // We zero all non-resident pages in the chunk anyway for two reasons:
3015 //
3016 // 1) Efficiency. It's better to do all zeros as pipelined transfers once
3017 // rather than scatter them around for each populate operation.
3018 //
3019 // 2) Optimizing the common case of block_populate_gpu_chunk being called
3020 // for already-populated chunks. If we zero once at initial populate, we
3021 // can simply check whether the chunk is present in the array. Otherwise
3022 // we'd have to recompute the "is any page resident" mask every time.
3023
3024 // Roll up all pages in chunk_region which are resident somewhere
3025 uvm_page_mask_zero(zero_mask);
3026 for_each_id_in_mask(id, &block->resident)
3027 uvm_page_mask_or(zero_mask, zero_mask, uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE));
3028
3029 // If all pages in the chunk are resident somewhere, we don't need to clear
3030 // anything. Just make sure the chunk is tracked properly.
3031 if (uvm_page_mask_region_full(zero_mask, chunk_region)) {
3032 status = uvm_tracker_add_tracker_safe(&block->tracker, tracker);
3033 goto out;
3034 }
3035
3036 // Complement to get the pages which are not resident anywhere. These
3037 // are the pages which must be zeroed.
3038 uvm_page_mask_complement(zero_mask, zero_mask);
3039
3040 memset_addr_base = uvm_gpu_address_copy(gpu, uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address));
3041 memset_addr = memset_addr_base;
3042
3043 status = uvm_push_begin_acquire(gpu->channel_manager,
3044 UVM_CHANNEL_TYPE_GPU_INTERNAL,
3045 tracker,
3046 &push,
3047 "Zero out chunk [0x%llx, 0x%llx) for region [0x%llx, 0x%llx) in va block [0x%llx, 0x%llx)",
3048 chunk->address,
3049 chunk->address + uvm_gpu_chunk_get_size(chunk),
3050 uvm_va_block_region_start(block, chunk_region),
3051 uvm_va_block_region_end(block, chunk_region) + 1,
3052 block->start,
3053 block->end + 1);
3054 if (status != NV_OK)
3055 goto out;
3056
3057 for_each_va_block_subregion_in_mask(subregion, zero_mask, chunk_region) {
3058 // Pipeline the memsets since they never overlap with each other
3059 uvm_push_set_flag(&push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3060
3061 // We'll push one membar later for all memsets in this loop
3062 uvm_push_set_flag(&push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3063
3064 memset_addr.address = memset_addr_base.address + (subregion.first - chunk_region.first) * PAGE_SIZE;
3065 gpu->parent->ce_hal->memset_8(&push, memset_addr, 0, uvm_va_block_region_size(subregion));
3066 }
3067
3068 // A membar from this GPU is required between this memset and any PTE write
3069 // pointing this or another GPU to this chunk. Otherwise an engine could
3070 // read the PTE then access the page before the memset write is visible to
3071 // that engine.
3072 //
3073 // This memset writes GPU memory, so local mappings need only a GPU-local
3074 // membar. We can't easily determine here whether a peer GPU will ever map
3075 // this page in the future, so always use a sysmembar. uvm_push_end provides
3076 // one by default.
3077 //
3078 // TODO: Bug 1766424: Use GPU-local membars if no peer can currently map
3079 // this page. When peer access gets enabled, do a MEMBAR_SYS at that
3080 // point.
3081 uvm_push_end(&push);
3082 status = uvm_tracker_add_push_safe(&block->tracker, &push);
3083
3084 out:
3085 if (zero_mask)
3086 kmem_cache_free(g_uvm_page_mask_cache, zero_mask);
3087
3088 return status;
3089 }
3090
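// Ensure the GPU chunk backing chunk_region is allocated on the given GPU:
// allocate the chunk, zero the pages which aren't resident anywhere else, map
// it for indirect peers and record it in the GPU state. Returns NV_OK without
// doing any work if the chunk is already populated.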
3091 static NV_STATUS block_populate_gpu_chunk(uvm_va_block_t *block,
3092 uvm_va_block_retry_t *retry,
3093 uvm_gpu_t *gpu,
3094 size_t chunk_index,
3095 uvm_va_block_region_t chunk_region)
3096 {
3097 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
3098 uvm_gpu_chunk_t *chunk = NULL;
3099 uvm_chunk_size_t chunk_size = uvm_va_block_region_size(chunk_region);
3100 uvm_va_block_test_t *block_test = uvm_va_block_get_test(block);
3101 NV_STATUS status;
3102
3103 if (!gpu_state)
3104 return NV_ERR_NO_MEMORY;
3105
3106 uvm_assert_mutex_locked(&block->lock);
3107 UVM_ASSERT(chunk_index < block_num_gpu_chunks(block, gpu));
3108 UVM_ASSERT(chunk_size & gpu->parent->mmu_user_chunk_sizes);
3109
3110 // We zero chunks as necessary at initial population, so if the chunk is
3111 // already populated we're done. See the comment in
3112 // block_zero_new_gpu_chunk.
3113 if (gpu_state->chunks[chunk_index])
3114 return NV_OK;
3115
3116 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->resident, chunk_region));
3117
3118 status = block_alloc_gpu_chunk(block, retry, gpu, chunk_size, &chunk);
3119 if (status != NV_OK)
3120 return status;
3121
3122 // In some configurations such as SR-IOV heavy, the chunk cannot be
3123 // referenced using its physical address. Create a virtual mapping.
3124 status = uvm_mmu_chunk_map(chunk);
3125 if (status != NV_OK)
3126 goto chunk_free;
3127
3128 status = block_zero_new_gpu_chunk(block, gpu, chunk, chunk_region, &retry->tracker);
3129 if (status != NV_OK)
3130 goto chunk_unmap;
3131
3132 // It is safe to modify the page index field without holding any PMM locks
3133 // because the chunk is pinned, which means that none of the other fields in
3134 // the bitmap can change.
3135 chunk->va_block_page_index = chunk_region.first;
3136
3137 // va_block_page_index is a bitfield of size PAGE_SHIFT. Make sure at
3138 // compile-time that it can store VA Block page indexes.
3139 BUILD_BUG_ON(PAGES_PER_UVM_VA_BLOCK >= PAGE_SIZE);
3140
3141 status = block_map_indirect_peers_to_gpu_chunk(block, gpu, chunk);
3142 if (status != NV_OK)
3143 goto chunk_unmap;
3144
3145 if (block_test && block_test->inject_populate_error) {
3146 block_test->inject_populate_error = false;
3147
3148 // Use NV_ERR_MORE_PROCESSING_REQUIRED to force a retry rather than
3149 // causing a fatal OOM failure.
3150 status = NV_ERR_MORE_PROCESSING_REQUIRED;
3151 goto chunk_unmap_indirect_peers;
3152 }
3153
3154 // Record the used chunk so that it can be unpinned at the end of the whole
3155 // operation.
3156 block_retry_add_used_chunk(retry, chunk);
3157 gpu_state->chunks[chunk_index] = chunk;
3158
3159 return NV_OK;
3160
3161 chunk_unmap_indirect_peers:
3162 block_unmap_indirect_peers_from_gpu_chunk(block, gpu, chunk);
3163
3164 chunk_unmap:
3165 uvm_mmu_chunk_unmap(chunk, &block->tracker);
3166
3167 chunk_free:
3168 // block_zero_new_gpu_chunk may have pushed memsets on this chunk which it
3169 // placed in the block tracker.
3170 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
3171
3172 return status;
3173 }
3174
3175 // Populate all chunks which cover the given region and page mask.
3176 static NV_STATUS block_populate_pages_gpu(uvm_va_block_t *block,
3177 uvm_va_block_retry_t *retry,
3178 uvm_gpu_t *gpu,
3179 uvm_va_block_region_t region,
3180 const uvm_page_mask_t *populate_mask)
3181 {
3182 uvm_va_block_region_t chunk_region, check_region;
3183 size_t chunk_index;
3184 uvm_page_index_t page_index;
3185 uvm_chunk_size_t chunk_size;
3186 NV_STATUS status;
3187
3188 page_index = uvm_va_block_first_page_in_mask(region, populate_mask);
3189 if (page_index == region.outer)
3190 return NV_OK;
3191
3192 chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
3193 chunk_region = uvm_va_block_chunk_region(block, chunk_size, page_index);
3194
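// Walk the GPU chunks overlapping the region in address order, populating a
// chunk only when the populate mask has at least one page in the portion of
// the chunk that overlaps the region.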
3195 while (1) {
3196 check_region = uvm_va_block_region(max(chunk_region.first, region.first),
3197 min(chunk_region.outer, region.outer));
3198 page_index = uvm_va_block_first_page_in_mask(check_region, populate_mask);
3199 if (page_index != check_region.outer) {
3200 status = block_populate_gpu_chunk(block, retry, gpu, chunk_index, chunk_region);
3201 if (status != NV_OK)
3202 return status;
3203 }
3204
3205 if (check_region.outer == region.outer)
3206 break;
3207
3208 ++chunk_index;
3209 chunk_size = block_gpu_chunk_size(block, gpu, chunk_region.outer);
3210 chunk_region = uvm_va_block_region(chunk_region.outer, chunk_region.outer + (chunk_size / PAGE_SIZE));
3211 }
3212
3213 return NV_OK;
3214 }
3215
3216 static const uvm_processor_mask_t *block_get_can_copy_from_mask(uvm_va_block_t *block, uvm_processor_id_t from)
3217 {
3218 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3219
3220 return &va_space->can_copy_from[uvm_id_value(from)];
3221 }
3222
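// Allocate backing storage on dest_id for all pages in region and page_mask
// which are not already resident there. For GPU destinations this also
// computes pages_staged: the pages which must be staged through the CPU
// because dest_id cannot copy directly from their current location; CPU chunks
// are then populated for those staged pages.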
3223 static NV_STATUS block_populate_pages(uvm_va_block_t *block,
3224 uvm_va_block_retry_t *retry,
3225 uvm_va_block_context_t *block_context,
3226 uvm_processor_id_t dest_id,
3227 uvm_va_block_region_t region,
3228 const uvm_page_mask_t *page_mask)
3229 {
3230 NV_STATUS status;
3231 const uvm_page_mask_t *resident_mask = block_resident_mask_get_alloc(block,
3232 dest_id,
3233 block_context->make_resident.dest_nid);
3234 uvm_page_mask_t *populate_page_mask = &block_context->make_resident.page_mask;
3235 uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
3236 uvm_page_mask_t *cpu_populate_mask;
3237 uvm_memcg_context_t memcg_context;
3238
3239 if (!resident_mask)
3240 return NV_ERR_NO_MEMORY;
3241
3242 if (page_mask)
3243 uvm_page_mask_andnot(populate_page_mask, page_mask, resident_mask);
3244 else
3245 uvm_page_mask_complement(populate_page_mask, resident_mask);
3246
3247 if (UVM_ID_IS_GPU(dest_id)) {
3248 const uvm_processor_mask_t *can_copy_from_processors;
3249 uvm_processor_mask_t *tmp_processor_mask;
3250 uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
3251 uvm_page_mask_t *id_resident_mask;
3252 uvm_processor_id_t id;
3253
3254 tmp_processor_mask = uvm_processor_mask_cache_alloc();
3255 if (!tmp_processor_mask)
3256 return NV_ERR_NO_MEMORY;
3257
3258 status = block_populate_pages_gpu(block, retry, block_get_gpu(block, dest_id), region, populate_page_mask);
3259 if (status != NV_OK) {
3260 uvm_processor_mask_cache_free(tmp_processor_mask);
3261 return status;
3262 }
3263
3264 uvm_page_mask_zero(pages_staged);
3265
3266 // Get the mask of all processors that have resident pages from which
3267 // the destination cannot copy directly.
3268 can_copy_from_processors = block_get_can_copy_from_mask(block, dest_id);
3269 if (!uvm_processor_mask_andnot(tmp_processor_mask, &block->resident, can_copy_from_processors)) {
3270 uvm_processor_mask_cache_free(tmp_processor_mask);
3271 return status;
3272 }
3273
3274 // Compute the pages that will be staged through the CPU by:
3275 // 1. Computing all of the pages resident on the processors from which
3276 // dest_id cannot directly copy.
3277 for_each_id_in_mask(id, tmp_processor_mask) {
3278 id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
3279 uvm_page_mask_and(scratch_page_mask, populate_page_mask, id_resident_mask);
3280 uvm_page_mask_or(pages_staged, pages_staged, scratch_page_mask);
3281 }
3282
3283 // 2. Remove any pages in pages_staged that are on any resident processor
3284 // dest_id can copy from.
3285 if (uvm_processor_mask_and(tmp_processor_mask, can_copy_from_processors, &block->resident)) {
3286 for_each_id_in_mask(id, tmp_processor_mask) {
3287 id_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
3288 uvm_page_mask_andnot(pages_staged, pages_staged, id_resident_mask);
3289 }
3290 }
3291
3292 // 3. Removing any pages not in the populate mask.
3293 uvm_page_mask_region_clear_outside(pages_staged, region);
3294 cpu_populate_mask = pages_staged;
3295
3296 uvm_processor_mask_cache_free(tmp_processor_mask);
3297 }
3298 else {
3299 cpu_populate_mask = populate_page_mask;
3300 }
3301
3302 uvm_memcg_context_start(&memcg_context, block_context->mm);
3303 status = block_populate_pages_cpu(block, cpu_populate_mask, region, block_context, UVM_ID_IS_GPU(dest_id));
3304 uvm_memcg_context_end(&memcg_context);
3305 return status;
3306 }
3307
3308 static bool block_can_copy_from(uvm_va_block_t *va_block, uvm_processor_id_t from, uvm_processor_id_t to)
3309 {
3310 return uvm_processor_mask_test(block_get_can_copy_from_mask(va_block, to), from);
3311 }
3312
3313 // Get the chunk containing the given page, along with the offset of that page
3314 // within the chunk.
3315 static uvm_gpu_chunk_t *block_phys_page_chunk(uvm_va_block_t *block, block_phys_page_t block_page, size_t *chunk_offset)
3316 {
3317 uvm_gpu_t *gpu = block_get_gpu(block, block_page.processor);
3318 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, block_page.processor);
3319 size_t chunk_index;
3320 uvm_gpu_chunk_t *chunk;
3321 uvm_chunk_size_t chunk_size;
3322
3323 UVM_ASSERT(gpu_state);
3324
3325 chunk_index = block_gpu_chunk_index(block, gpu, block_page.page_index, &chunk_size);
3326 chunk = gpu_state->chunks[chunk_index];
3327 UVM_ASSERT(chunk);
3328
3329 if (chunk_offset) {
3330 size_t page_offset = block_page.page_index -
3331 uvm_va_block_chunk_region(block, chunk_size, block_page.page_index).first;
3332 *chunk_offset = page_offset * PAGE_SIZE;
3333 }
3334
3335 return chunk;
3336 }
3337
3338 // Get the physical GPU address of a block's page from the POV of the specified GPU.
3339 // This is the address that should be used for making PTEs for the specified GPU.
3340 static uvm_gpu_phys_address_t block_phys_page_address(uvm_va_block_t *block,
3341 block_phys_page_t block_page,
3342 uvm_gpu_t *gpu)
3343 {
3344 uvm_va_block_gpu_state_t *accessing_gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3345 size_t chunk_offset;
3346 uvm_gpu_chunk_t *chunk;
3347
3348 UVM_ASSERT(accessing_gpu_state);
3349
3350 if (UVM_ID_IS_CPU(block_page.processor)) {
3351 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, block_page.nid, block_page.page_index);
3352 NvU64 dma_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
3353 uvm_va_block_region_t chunk_region = uvm_va_block_chunk_region(block,
3354 uvm_cpu_chunk_get_size(chunk),
3355 block_page.page_index);
3356
3357 // The page should be mapped for physical access already as we do that
3358 // eagerly on CPU page population and GPU state alloc.
3359 UVM_ASSERT(dma_addr != 0);
3360 dma_addr += (block_page.page_index - chunk_region.first) * PAGE_SIZE;
3361
3362 return uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
3363 }
3364
3365 chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
3366
3367 if (uvm_id_equal(block_page.processor, gpu->id)) {
3368 return uvm_gpu_phys_address(UVM_APERTURE_VID, chunk->address + chunk_offset);
3369 }
3370 else {
3371 uvm_gpu_phys_address_t phys_addr;
3372 uvm_gpu_t *owning_gpu = block_get_gpu(block, block_page.processor);
3373 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
3374
3375 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
3376 phys_addr = uvm_pmm_gpu_peer_phys_address(&owning_gpu->pmm, chunk, gpu);
3377 phys_addr.address += chunk_offset;
3378 return phys_addr;
3379 }
3380 }
3381
3382 // Get the physical GPU address of a block's page from the POV of the specified
3383 // GPU, suitable for accessing the memory from UVM-internal CE channels.
3384 //
3385 // Notably this may be different from block_phys_page_address() to handle CE
3386 // limitations in addressing physical memory directly.
3387 static uvm_gpu_address_t block_phys_page_copy_address(uvm_va_block_t *block,
3388 block_phys_page_t block_page,
3389 uvm_gpu_t *gpu)
3390 {
3391 uvm_gpu_t *owning_gpu;
3392 size_t chunk_offset;
3393 uvm_gpu_chunk_t *chunk;
3394 uvm_gpu_address_t copy_addr;
3395 uvm_va_space_t *va_space;
3396
3397 UVM_ASSERT_MSG(block_can_copy_from(block, gpu->id, block_page.processor),
3398 "from %s to %s\n",
3399 block_processor_name(block, gpu->id),
3400 block_processor_name(block, block_page.processor));
3401
3402 // CPU and local GPU accesses can rely on block_phys_page_address, but the
3403 // resulting physical address may need to be converted into virtual.
3404 if (UVM_ID_IS_CPU(block_page.processor) || uvm_id_equal(block_page.processor, gpu->id))
3405 return uvm_gpu_address_copy(gpu, block_phys_page_address(block, block_page, gpu));
3406
3407 va_space = uvm_va_block_get_va_space(block);
3408
3409 // See the comments on the peer_identity_mappings_supported assignments in
3410 // the HAL for why we disable direct copies between peers.
3411 owning_gpu = block_get_gpu(block, block_page.processor);
3412
3413 UVM_ASSERT(uvm_va_space_peer_enabled(va_space, gpu, owning_gpu));
3414
3415 chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
3416 copy_addr = uvm_pmm_gpu_peer_copy_address(&owning_gpu->pmm, chunk, gpu);
3417 copy_addr.address += chunk_offset;
3418 return copy_addr;
3419 }
3420
3421 uvm_gpu_phys_address_t uvm_va_block_res_phys_page_address(uvm_va_block_t *va_block,
3422 uvm_page_index_t page_index,
3423 uvm_processor_id_t residency,
3424 uvm_gpu_t *gpu)
3425 {
3426 int nid = NUMA_NO_NODE;
3427
3428 uvm_assert_mutex_locked(&va_block->lock);
3429 if (UVM_ID_IS_CPU(residency)) {
3430 nid = block_get_page_node_residency(va_block, page_index);
3431 UVM_ASSERT(nid != NUMA_NO_NODE);
3432 }
3433
3434 return block_phys_page_address(va_block, block_phys_page(residency, nid, page_index), gpu);
3435 }
3436
3437 uvm_gpu_phys_address_t uvm_va_block_gpu_phys_page_address(uvm_va_block_t *va_block,
3438 uvm_page_index_t page_index,
3439 uvm_gpu_t *gpu)
3440 {
3441 return uvm_va_block_res_phys_page_address(va_block, page_index, gpu->id, gpu);
3442 }
3443
3444 typedef struct
3445 {
3446 // Location of the memory
3447 uvm_processor_id_t id;
3448
3449 // NUMA node ID if the processor is the CPU. Ignored otherwise.
3450 int nid;
3451
3452 // Whether the whole block has a single physically-contiguous chunk of
3453 // storage on the processor.
3454 bool is_block_contig;
3455
3456 // Starting address of the physically-contiguous allocation, from the view
3457 // of the copying GPU. Valid only if is_block_contig.
3458 uvm_gpu_address_t gpu_address;
3459 } block_copy_addr_t;
3460
3461 typedef struct
3462 {
3463 block_copy_addr_t src;
3464 block_copy_addr_t dst;
3465 uvm_conf_computing_dma_buffer_t *dma_buffer;
3466 // True if at least one CE transfer (such as a memcopy) has already been
3467 // pushed to the GPU during the VA block copy thus far.
3468 bool copy_pushed;
3469 } block_copy_state_t;
3470
3471 // Begin a push appropriate for copying data from src_id processor to dst_id processor.
3472 // One of src_id and dst_id needs to be a GPU.
3473 static NV_STATUS block_copy_begin_push(uvm_va_block_t *va_block,
3474 block_copy_state_t *copy_state,
3475 uvm_tracker_t *tracker,
3476 uvm_push_t *push)
3477 {
3478 uvm_gpu_t *gpu;
3479 NV_STATUS status;
3480 uvm_channel_type_t channel_type;
3481 uvm_tracker_t *tracker_ptr = tracker;
3482 uvm_processor_id_t dst_id = copy_state->dst.id;
3483 uvm_processor_id_t src_id = copy_state->src.id;
3484 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
3485 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
3486
3487 if (!(uvm_block_cpu_to_cpu_copy_with_ce || va_space->test.force_cpu_to_cpu_copy_with_ce) ||
3488 UVM_ID_IS_GPU(src_id) ||
3489 UVM_ID_IS_GPU(dst_id)) {
3490 UVM_ASSERT_MSG(!uvm_id_equal(src_id, dst_id),
3491 "Unexpected copy to self, processor %s\n",
3492 block_processor_name(va_block, src_id));
3493 }
3494
3495 if (UVM_ID_IS_CPU(src_id) && UVM_ID_IS_CPU(dst_id)) {
3496 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
3497
3498 gpu = uvm_va_space_find_first_gpu_attached_to_cpu_node(va_space, copy_state->src.nid);
3499 if (!gpu)
3500 gpu = uvm_va_space_find_first_gpu(va_space);
3501
3502 channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
3503 }
3504 else if (UVM_ID_IS_CPU(src_id)) {
3505 gpu = block_get_gpu(va_block, dst_id);
3506 channel_type = UVM_CHANNEL_TYPE_CPU_TO_GPU;
3507 }
3508 else if (UVM_ID_IS_CPU(dst_id)) {
3509 gpu = block_get_gpu(va_block, src_id);
3510 channel_type = UVM_CHANNEL_TYPE_GPU_TO_CPU;
3511 }
3512 else {
3513 // For GPU to GPU copies, prefer to "push" the data from the source as
3514 // that works better at least for P2P over PCI-E.
3515 gpu = block_get_gpu(va_block, src_id);
3516
3517 channel_type = UVM_CHANNEL_TYPE_GPU_TO_GPU;
3518 }
3519
3520 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, dst_id),
3521 "GPU %s dst %s src %s\n",
3522 block_processor_name(va_block, gpu->id),
3523 block_processor_name(va_block, dst_id),
3524 block_processor_name(va_block, src_id));
3525 UVM_ASSERT_MSG(block_can_copy_from(va_block, gpu->id, src_id),
3526 "GPU %s dst %s src %s\n",
3527 block_processor_name(va_block, gpu->id),
3528 block_processor_name(va_block, dst_id),
3529 block_processor_name(va_block, src_id));
3530
3531 if (channel_type == UVM_CHANNEL_TYPE_GPU_TO_GPU) {
3532 uvm_gpu_t *dst_gpu = block_get_gpu(va_block, dst_id);
3533 return uvm_push_begin_acquire_gpu_to_gpu(gpu->channel_manager,
3534 dst_gpu,
3535 tracker,
3536 push,
3537 "Copy from %s to %s for block [0x%llx, 0x%llx]",
3538 block_processor_name(va_block, src_id),
3539 block_processor_name(va_block, dst_id),
3540 va_block->start,
3541 va_block->end);
3542 }
3543
3544 if (g_uvm_global.conf_computing_enabled) {
3545 // When Confidential Computing is enabled, additional dependencies
3546 // apply to the input tracker as well as the dma_buffer tracker.
3547 // * In the CPU to GPU case, because UVM performs CPU side
3548 // crypto-operations first before the GPU copy, we both need to
3549 // ensure that the dma_buffer and the input tracker are completed.
3550 // * In the GPU to CPU case, the GPU copy happens first, but the same
3551 // principles apply. Hence, UVM acquires the input tracker and the
3552 // dma buffer.
3553 status = uvm_tracker_overwrite_safe(&local_tracker, tracker);
3554 if (status != NV_OK)
3555 goto error;
3556
3557 UVM_ASSERT(copy_state->dma_buffer == NULL);
3558 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
3559 &copy_state->dma_buffer,
3560 &local_tracker);
3561
3562 if (status != NV_OK)
3563 goto error;
3564
3565 if (channel_type == UVM_CHANNEL_TYPE_CPU_TO_GPU) {
3566 status = uvm_tracker_wait(&local_tracker);
3567 if (status != NV_OK)
3568 goto error;
3569 }
3570
3571 tracker_ptr = &local_tracker;
3572 }
3573
3574 status = uvm_push_begin_acquire(gpu->channel_manager,
3575 channel_type,
3576 tracker_ptr,
3577 push,
3578 "Copy from %s to %s for block [0x%llx, 0x%llx]",
3579 block_processor_name(va_block, src_id),
3580 block_processor_name(va_block, dst_id),
3581 va_block->start,
3582 va_block->end);
3583
3584 error:
3585 // Caller is responsible for freeing the DMA buffer on error
3586 uvm_tracker_deinit(&local_tracker);
3587 return status;
3588 }
3589
3590 // A page is clean iff...
3591 // the destination is equal to the preferred location and
3592 // the source is the CPU and
3593 // the destination is not the CPU and
3594 // the destination does not support faults/eviction and
3595 // the CPU page is not dirty
3596 static bool block_page_is_clean(uvm_va_block_t *block,
3597 uvm_processor_id_t dst_id,
3598 int dst_nid,
3599 uvm_processor_id_t src_id,
3600 int src_nid,
3601 uvm_page_index_t page_index)
3602 {
3603 return !uvm_va_block_is_hmm(block) &&
3604 uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), dst_id, dst_nid) &&
3605 UVM_ID_IS_CPU(src_id) &&
3606 !UVM_ID_IS_CPU(dst_id) &&
3607 !block_get_gpu(block, dst_id)->parent->isr.replayable_faults.handling &&
3608 !block_cpu_page_is_dirty(block, page_index, src_nid);
3609 }
3610
3611 // When the destination is the CPU...
3612 // if the source is the preferred location and NUMA node id, mark as clean
3613 // otherwise, mark as dirty
3614 static void block_update_page_dirty_state(uvm_va_block_t *block,
3615 uvm_processor_id_t dst_id,
3616 int dst_nid,
3617 uvm_processor_id_t src_id,
3618 int src_nid,
3619 uvm_page_index_t page_index)
3620 {
3621 uvm_va_policy_t *policy;
3622
3623 if (UVM_ID_IS_GPU(dst_id))
3624 return;
3625
3626 policy = uvm_va_range_get_policy(block->va_range);
3627 if (uvm_va_policy_preferred_location_equal(policy, src_id, src_nid))
3628 block_mark_cpu_page_clean(block, page_index, dst_nid);
3629 else
3630 block_mark_cpu_page_dirty(block, page_index, dst_nid);
3631 }
3632
3633 static void block_mark_memory_used(uvm_va_block_t *block, uvm_processor_id_t id)
3634 {
3635 uvm_gpu_t *gpu;
3636
3637 if (UVM_ID_IS_CPU(id))
3638 return;
3639
3640 gpu = block_get_gpu(block, id);
3641
3642 // If the block is of the max size and the GPU supports eviction, mark the
3643 // root chunk as used in PMM.
3644 // HMM always allocates PAGE_SIZE GPU chunks so skip HMM va_blocks.
3645 if (!uvm_va_block_is_hmm(block) &&
3646 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
3647 uvm_parent_gpu_supports_eviction(gpu->parent)) {
3648 // The chunk has to be there if this GPU is resident
3649 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id));
3650 uvm_pmm_gpu_mark_root_chunk_used(&gpu->pmm, uvm_va_block_gpu_state_get(block, gpu->id)->chunks[0]);
3651 }
3652 }
3653
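// Mark the processor as having resident pages in the block. On the first
// transition to resident, the root chunk is also marked as used in PMM (see
// block_mark_memory_used) so eviction can account for it.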
3654 static void block_set_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
3655 {
3656 UVM_ASSERT(!uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
3657
3658 if (uvm_processor_mask_test_and_set(&block->resident, id))
3659 return;
3660
3661 block_mark_memory_used(block, id);
3662 }
3663
3664 static void block_clear_resident_processor(uvm_va_block_t *block, uvm_processor_id_t id)
3665 {
3666 uvm_gpu_t *gpu;
3667
3668 UVM_ASSERT(uvm_page_mask_empty(uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE)));
3669
3670 if (!uvm_processor_mask_test_and_clear(&block->resident, id))
3671 return;
3672
3673 if (UVM_ID_IS_CPU(id))
3674 return;
3675
3676 gpu = block_get_gpu(block, id);
3677
3678 // If the block is of the max size and the GPU supports eviction, mark the
3679 // root chunk as unused in PMM.
3680 if (!uvm_va_block_is_hmm(block) &&
3681 uvm_va_block_size(block) == UVM_CHUNK_SIZE_MAX &&
3682 uvm_parent_gpu_supports_eviction(gpu->parent)) {
3683 // The chunk may not be there any more when residency is cleared.
3684 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
3685 if (gpu_state && gpu_state->chunks[0])
3686 uvm_pmm_gpu_mark_root_chunk_unused(&gpu->pmm, gpu_state->chunks[0]);
3687 }
3688 }
3689
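// Sanity check used by block_copy_get_address: the cached contiguous base
// address plus the page offset must match the per-page copy address computed
// by block_phys_page_copy_address.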
3690 static bool block_phys_copy_contig_check(uvm_va_block_t *block,
3691 uvm_page_index_t page_index,
3692 const uvm_gpu_address_t *base_address,
3693 uvm_processor_id_t proc_id,
3694 int nid,
3695 uvm_gpu_t *copying_gpu)
3696 {
3697 uvm_gpu_address_t page_address;
3698 uvm_gpu_address_t contig_address = *base_address;
3699
3700 contig_address.address += page_index * PAGE_SIZE;
3701 page_address = block_phys_page_copy_address(block, block_phys_page(proc_id, nid, page_index), copying_gpu);
3702
3703 return uvm_gpu_addr_cmp(page_address, contig_address) == 0;
3704 }
3705
3706 // Check if the VA block has a single physically-contiguous chunk of storage
3707 // on the processor.
3708 static bool is_block_phys_contig(uvm_va_block_t *block, uvm_processor_id_t id, int nid)
3709 {
3710 uvm_cpu_chunk_t *chunk;
3711
3712 if (UVM_ID_IS_GPU(id))
3713 return uvm_va_block_size(block) == block_gpu_chunk_size(block, block_get_gpu(block, id), 0);
3714
3715 UVM_ASSERT(nid != NUMA_NO_NODE);
3716 chunk = uvm_cpu_chunk_first_in_region(block, uvm_va_block_region_from_block(block), nid, NULL);
3717 return chunk && (uvm_va_block_size(block) == uvm_cpu_chunk_get_size(chunk));
3718 }
3719
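// Return the block region backed by the single physically-contiguous chunk
// (a CPU chunk on the given NUMA node, or a GPU chunk) containing page_index
// on resident_id.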
3720 static uvm_va_block_region_t block_phys_contig_region(uvm_va_block_t *block,
3721 uvm_page_index_t page_index,
3722 uvm_processor_id_t resident_id,
3723 int nid)
3724 {
3725 if (UVM_ID_IS_CPU(resident_id)) {
3726 uvm_cpu_chunk_t *chunk;
3727 UVM_ASSERT(nid != NUMA_NO_NODE);
3728 chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
3729 return uvm_cpu_chunk_block_region(block, chunk, page_index);
3730 }
3731 else {
3732 uvm_chunk_size_t chunk_size;
3733 (void)block_gpu_chunk_index(block, block_get_gpu(block, resident_id), page_index, &chunk_size);
3734 return uvm_va_block_chunk_region(block, chunk_size, page_index);
3735 }
3736 }
3737
3738 // Like block_phys_page_copy_address, but uses the address cached in bca when
3739 // possible.
3740 static uvm_gpu_address_t block_copy_get_address(uvm_va_block_t *block,
3741 block_copy_addr_t *bca,
3742 uvm_page_index_t page_index,
3743 uvm_gpu_t *copying_gpu)
3744 {
3745 if (bca->is_block_contig) {
3746 uvm_gpu_address_t addr = bca->gpu_address;
3747 addr.address += page_index * PAGE_SIZE;
3748 UVM_ASSERT(block_phys_copy_contig_check(block, page_index, &bca->gpu_address, bca->id, bca->nid, copying_gpu));
3749 return addr;
3750 }
3751
3752 return block_phys_page_copy_address(block, block_phys_page(bca->id, bca->nid, page_index), copying_gpu);
3753 }
3754
3755 // When the Confidential Computing feature is enabled, the function performs
3756 // CPU side page encryption and GPU side decryption to the CPR.
3757 // GPU operations respect the caller's membar previously set in the push.
3758 static void conf_computing_block_copy_push_cpu_to_gpu(uvm_va_block_t *block,
3759 block_copy_state_t *copy_state,
3760 uvm_va_block_region_t region,
3761 uvm_push_t *push)
3762 {
3763 uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
3764 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3765 uvm_page_index_t page_index = region.first;
3766 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3767 struct page *src_page;
3768 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3769 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3770 char *cpu_auth_tag_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag) +
3771 (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3772 uvm_gpu_address_t dst_address = block_copy_get_address(block, &copy_state->dst, page_index, gpu);
3773 char *cpu_va_staging_buffer = (char *)uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc) + (page_index * PAGE_SIZE);
3774 uvm_cpu_chunk_t *chunk;
3775 uvm_va_block_region_t chunk_region;
3776
3777 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->src.id));
3778 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->dst.id));
3779 UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3780
3781 // See comment in block_copy_begin_push.
3782 UVM_ASSERT(uvm_tracker_is_completed(&block->tracker));
3783
3784 chunk = uvm_cpu_chunk_get_chunk_for_page(block, copy_state->src.nid, page_index);
3785 UVM_ASSERT(chunk);
3786
3787 // The caller guarantees that all pages in region are contiguous,
3788 // meaning they're guaranteed to be part of the same compound page.
3789 chunk_region = uvm_va_block_chunk_region(block, uvm_cpu_chunk_get_size(chunk), page_index);
3790 UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, region));
3791
3792 src_page = uvm_cpu_chunk_get_cpu_page(block, chunk, page_index);
3793 staging_buffer.address += page_index * PAGE_SIZE;
3794 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3795
3796 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3797 push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3798 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3799 push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3800
3801 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3802 // decryption must happen on a PAGE_SIZE basis.
3803 for_each_va_block_page_in_region(page_index, region) {
3804 void *src_cpu_virt_addr;
3805
3806 src_cpu_virt_addr = kmap(src_page);
3807 uvm_conf_computing_cpu_encrypt(push->channel,
3808 cpu_va_staging_buffer,
3809 src_cpu_virt_addr,
3810 NULL,
3811 PAGE_SIZE,
3812 cpu_auth_tag_buffer);
3813 kunmap(src_page);
3814
3815 // All but the first decryption can be pipelined. The first decryption
3816 // uses the caller's pipelining settings.
3817 if (page_index > region.first)
3818 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3819
3820 if (page_index < (region.outer - 1))
3821 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3822 else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
3823 uvm_push_set_flag(push, push_membar_flag);
3824
3825 gpu->parent->ce_hal->decrypt(push, dst_address, staging_buffer, PAGE_SIZE, auth_tag_buffer);
3826
3827 src_page++;
3828 dst_address.address += PAGE_SIZE;
3829 cpu_va_staging_buffer += PAGE_SIZE;
3830 staging_buffer.address += PAGE_SIZE;
3831 cpu_auth_tag_buffer += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3832 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3833 }
3834 }
3835
3836 // When the Confidential Computing feature is enabled, the function performs
3837 // GPU side page encryption. GPU operations respect the caller's membar
3838 // previously set in the push.
3839 static void conf_computing_block_copy_push_gpu_to_cpu(uvm_va_block_t *block,
3840 block_copy_state_t *copy_state,
3841 uvm_va_block_region_t region,
3842 uvm_push_t *push)
3843 {
3844 uvm_push_flag_t push_membar_flag = UVM_PUSH_FLAG_COUNT;
3845 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3846 uvm_page_index_t page_index = region.first;
3847 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3848 uvm_gpu_address_t staging_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->alloc, gpu);
3849 uvm_gpu_address_t auth_tag_buffer = uvm_mem_gpu_address_virtual_kernel(dma_buffer->auth_tag, gpu);
3850 uvm_gpu_address_t src_address = block_copy_get_address(block, &copy_state->src, page_index, gpu);
3851 NvU32 key_version = uvm_channel_pool_key_version(push->channel->pool);
3852
3853 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3854 UVM_ASSERT(UVM_ID_IS_CPU(copy_state->dst.id));
3855 UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3856
3857 staging_buffer.address += page_index * PAGE_SIZE;
3858 auth_tag_buffer.address += page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3859
3860 if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE))
3861 push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_NONE;
3862 else if (uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU))
3863 push_membar_flag = UVM_PUSH_FLAG_NEXT_MEMBAR_GPU;
3864
3865 // Because we use kmap() for mapping pages for CPU side
3866 // crypto-operations and it only guarantees PAGE_SIZE contiguity, all
3867 // encryptions and decryptions must happen on a PAGE_SIZE basis.
3868 for_each_va_block_page_in_region(page_index, region) {
3869 uvm_conf_computing_log_gpu_encryption(push->channel, PAGE_SIZE, &dma_buffer->decrypt_iv[page_index]);
3870 dma_buffer->key_version[page_index] = key_version;
3871
3872 // All but the first encryption can be pipelined. The first encryption
3873 // uses the caller's pipelining settings.
3874 if (page_index > region.first)
3875 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3876
3877 if (page_index < (region.outer - 1))
3878 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3879 else if (push_membar_flag != UVM_PUSH_FLAG_COUNT)
3880 uvm_push_set_flag(push, push_membar_flag);
3881
3882 gpu->parent->ce_hal->encrypt(push, staging_buffer, src_address, PAGE_SIZE, auth_tag_buffer);
3883
3884 src_address.address += PAGE_SIZE;
3885 staging_buffer.address += PAGE_SIZE;
3886 auth_tag_buffer.address += UVM_CONF_COMPUTING_AUTH_TAG_SIZE;
3887 }
3888
3889 uvm_page_mask_region_fill(&dma_buffer->encrypted_page_mask, region);
3890 }
3891
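// Finish a Confidential Computing copy. For GPU to CPU copies this waits for
// the push to complete and then CPU-decrypts each staged page from the DMA
// buffer into its destination CPU chunk; GPU destinations need no CPU-side
// work.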
3892 static NV_STATUS conf_computing_copy_pages_finish(uvm_va_block_t *block,
3893 block_copy_state_t *copy_state,
3894 uvm_push_t *push)
3895 {
3896 NV_STATUS status;
3897 uvm_page_index_t page_index;
3898 uvm_conf_computing_dma_buffer_t *dma_buffer = copy_state->dma_buffer;
3899 uvm_page_mask_t *encrypted_page_mask = &dma_buffer->encrypted_page_mask;
3900 void *auth_tag_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->auth_tag);
3901 void *staging_buffer_base = uvm_mem_get_cpu_addr_kernel(dma_buffer->alloc);
3902
3903 UVM_ASSERT(g_uvm_global.conf_computing_enabled);
3904
3905 if (UVM_ID_IS_GPU(copy_state->dst.id))
3906 return NV_OK;
3907
3908 UVM_ASSERT(UVM_ID_IS_GPU(copy_state->src.id));
3909
3910 status = uvm_push_wait(push);
3911 if (status != NV_OK)
3912 return status;
3913
3914 // kmap() only guarantees PAGE_SIZE contiguity, all encryption and
3915 // decryption must happen on a PAGE_SIZE basis.
3916 for_each_va_block_page_in_mask(page_index, encrypted_page_mask, block) {
3917 // All CPU chunks for the copy have already been allocated in
3918 // block_populate_pages() and copy_state has been filled in based on
3919 // those allocations.
3920 uvm_cpu_chunk_t *cpu_chunk = uvm_cpu_chunk_get_chunk_for_page(block, copy_state->dst.nid, page_index);
3921 struct page *dst_page = uvm_cpu_chunk_get_cpu_page(block, cpu_chunk, page_index);
3922 void *staging_buffer = (char *)staging_buffer_base + (page_index * PAGE_SIZE);
3923 void *auth_tag_buffer = (char *)auth_tag_buffer_base + (page_index * UVM_CONF_COMPUTING_AUTH_TAG_SIZE);
3924 void *cpu_page_address = kmap(dst_page);
3925
3926 status = uvm_conf_computing_cpu_decrypt(push->channel,
3927 cpu_page_address,
3928 staging_buffer,
3929 dma_buffer->decrypt_iv + page_index,
3930 dma_buffer->key_version[page_index],
3931 PAGE_SIZE,
3932 auth_tag_buffer);
3933 kunmap(dst_page);
3934 if (status != NV_OK) {
3935 // TODO: Bug 3814087: [UVM][HCC] Handle CSL auth_tag verification
3936 // failures & other failures gracefully.
3937 // uvm_conf_computing_cpu_decrypt() can fail if the authentication
3938 // tag verification fails. Should this happen, it is considered a
3939 // critical failure from which we cannot recover.
3940 uvm_global_set_fatal_error(status);
3941 return status;
3942 }
3943 }
3944
3945 return NV_OK;
3946 }
3947
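// Push the copy of the given region to the GPU. With Confidential Computing
// enabled the copy goes through the encrypted staging paths above; otherwise
// it is a plain CE memcopy. All but the first transfer of the overall block
// copy are pipelined.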
3948 static void block_copy_push(uvm_va_block_t *block,
3949 block_copy_state_t *copy_state,
3950 uvm_va_block_region_t region,
3951 uvm_push_t *push)
3952 {
3953 uvm_gpu_address_t gpu_dst_address, gpu_src_address;
3954 uvm_gpu_t *gpu = uvm_push_get_gpu(push);
3955
3956 // Only the first transfer is not pipelined. Since the callees observe the
3957 // caller's pipeline settings, pipelining must be disabled in that first
3958 // transfer.
3959 if (copy_state->copy_pushed)
3960 uvm_push_set_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED);
3961 else
3962 UVM_ASSERT(!uvm_push_test_flag(push, UVM_PUSH_FLAG_CE_NEXT_PIPELINED));
3963
3964 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
3965
3966 if (g_uvm_global.conf_computing_enabled) {
3967 if (UVM_ID_IS_CPU(copy_state->src.id))
3968 conf_computing_block_copy_push_cpu_to_gpu(block, copy_state, region, push);
3969 else
3970 conf_computing_block_copy_push_gpu_to_cpu(block, copy_state, region, push);
3971 }
3972 else {
3973 gpu_dst_address = block_copy_get_address(block, &copy_state->dst, region.first, gpu);
3974 gpu_src_address = block_copy_get_address(block, &copy_state->src, region.first, gpu);
3975
3976 gpu->parent->ce_hal->memcopy(push, gpu_dst_address, gpu_src_address, uvm_va_block_region_size(region));
3977 }
3978
3979 copy_state->copy_pushed = true;
3980 }
3981
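// End the push for a block copy: complete any CPU-side Confidential Computing
// decryption, add the push to the copy tracker and, when Confidential
// Computing is enabled, return the DMA staging buffer to its pool.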
3982 static NV_STATUS block_copy_end_push(uvm_va_block_t *block,
3983 block_copy_state_t *copy_state,
3984 uvm_tracker_t *copy_tracker,
3985 NV_STATUS push_status,
3986 uvm_push_t *push)
3987 {
3988 NV_STATUS tracker_status;
3989
3990 // TODO: Bug 1766424: If the destination is a GPU and the copy was done
3991 // by that GPU, use a GPU-local membar if no peer can currently
3992 // map this page. When peer access gets enabled, do a MEMBAR_SYS
3993 // at that point.
3994 uvm_push_end(push);
3995
3996 if ((push_status == NV_OK) && g_uvm_global.conf_computing_enabled)
3997 push_status = conf_computing_copy_pages_finish(block, copy_state, push);
3998
3999 tracker_status = uvm_tracker_add_push_safe(copy_tracker, push);
4000 if (push_status == NV_OK)
4001 push_status = tracker_status;
4002
4003 if (g_uvm_global.conf_computing_enabled) {
4004 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
4005
4006 uvm_tracker_overwrite_with_push(&local_tracker, push);
4007 uvm_conf_computing_dma_buffer_free(&push->gpu->conf_computing.dma_buffer_pool,
4008 copy_state->dma_buffer,
4009 &local_tracker);
4010 copy_state->dma_buffer = NULL;
4011 uvm_tracker_deinit(&local_tracker);
4012 }
4013
4014 return push_status;
4015 }
4016
4017 // Copies use CEs if:
4018 // - uvm_block_cpu_to_cpu_copy_with_ce or
4019 // uvm_test_force_block_cpu_to_cpu_copy_with_ce are set AND there are
4020 // registered GPUs in the VA space.
4021 //    - the source and destination are not both the CPU.
4022 static bool block_copy_should_use_push(uvm_va_block_t *block, block_copy_state_t *copy_state)
4023 {
4024 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4025
4026 return ((uvm_block_cpu_to_cpu_copy_with_ce || va_space->test.force_cpu_to_cpu_copy_with_ce) &&
4027 uvm_processor_mask_get_gpu_count(&va_space->registered_gpus)) ||
4028 !(UVM_ID_IS_CPU(copy_state->src.id) && uvm_id_equal(copy_state->src.id, copy_state->dst.id));
4029 }
4030
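// Copy the pages in region either with a CPU memcpy() (CPU-to-CPU copies
// that don't use CEs) or by pushing GPU copy methods. The memcpy() path
// first waits on the block tracker, since plain CPU copies have no ordering
// guarantees with respect to in-flight GPU work.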
4031 static NV_STATUS block_copy_pages(uvm_va_block_t *va_block,
4032 block_copy_state_t *copy_state,
4033 uvm_va_block_region_t region,
4034 uvm_push_t *push)
4035 {
4036 if (!block_copy_should_use_push(va_block, copy_state)) {
4037 uvm_cpu_chunk_t *src_chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, copy_state->src.nid, region.first);
4038 uvm_cpu_chunk_t *dst_chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, copy_state->dst.nid, region.first);
4039 uvm_va_block_region_t src_chunk_region = uvm_cpu_chunk_block_region(va_block, src_chunk, region.first);
4040 uvm_va_block_region_t dst_chunk_region = uvm_cpu_chunk_block_region(va_block, dst_chunk, region.first);
4041 struct page *src_chunk_page = uvm_cpu_chunk_get_cpu_page(va_block, src_chunk, src_chunk_region.first);
4042 struct page *dst_chunk_page = uvm_cpu_chunk_get_cpu_page(va_block, dst_chunk, dst_chunk_region.first);
4043 uvm_page_index_t page_index;
4044 NV_STATUS status;
4045
4046 UVM_ASSERT(dst_chunk);
4047 UVM_ASSERT(uvm_cpu_chunk_get_size(src_chunk) >= uvm_va_block_region_size(region));
4048 UVM_ASSERT(uvm_va_block_region_size(region) <= uvm_cpu_chunk_get_size(dst_chunk));
4049
4050 // CPU-to-CPU copies using memcpy() don't have any inherent ordering with
4051 // copies using GPU CEs. So, we have to make sure that all previously
4052 // submitted work is complete.
4053 status = uvm_tracker_wait(&va_block->tracker);
4054 if (status != NV_OK)
4055 return status;
4056
4057 for_each_va_block_page_in_region(page_index, region) {
4058 struct page *src_page = src_chunk_page + (page_index - src_chunk_region.first);
4059 struct page *dst_page = dst_chunk_page + (page_index - dst_chunk_region.first);
4060 void *src_addr = kmap(src_page);
4061 void *dst_addr = kmap(dst_page);
4062
4063 memcpy(dst_addr, src_addr, PAGE_SIZE);
4064             kunmap(src_page);
4065             kunmap(dst_page);
4066
4067 if (block_cpu_page_is_dirty(va_block, page_index, copy_state->src.nid))
4068 block_mark_cpu_page_dirty(va_block, page_index, copy_state->dst.nid);
4069 }
4070 }
4071 else {
4072 block_copy_push(va_block, copy_state, region, push);
4073 }
4074
4075 return NV_OK;
4076 }
4077
4078 // Copies pages resident on the src_id processor to the dst_id processor
4079 //
4080 // The function adds the pages that were successfully copied to the output
4081 // migrated_pages mask and returns the number of pages in copied_pages. These
4082 // fields are reliable even if an error is returned.
4083 //
4084 // Acquires the block's tracker and adds all of its pushes to the copy_tracker.
4085 static NV_STATUS block_copy_resident_pages_between(uvm_va_block_t *block,
4086 uvm_va_block_context_t *block_context,
4087 uvm_processor_id_t dst_id,
4088 int dst_nid,
4089 uvm_processor_id_t src_id,
4090 int src_nid,
4091 uvm_va_block_region_t region,
4092 uvm_page_mask_t *copy_mask,
4093 const uvm_page_mask_t *prefetch_page_mask,
4094 uvm_va_block_transfer_mode_t transfer_mode,
4095 uvm_page_mask_t *migrated_pages,
4096 NvU32 *copied_pages,
4097 uvm_tracker_t *copy_tracker)
4098 {
4099 NV_STATUS status = NV_OK;
4100 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(block, dst_id, dst_nid);
4101 uvm_gpu_t *copying_gpu = NULL;
4102 uvm_push_t push;
4103 uvm_page_index_t page_index;
4104 uvm_page_index_t contig_start_index = region.outer;
4105 uvm_page_index_t last_index = region.outer;
4106 uvm_range_group_range_t *rgr = NULL;
4107 bool rgr_has_changed = false;
4108 uvm_make_resident_cause_t cause = block_context->make_resident.cause;
4109 uvm_make_resident_cause_t contig_cause = cause;
4110 const bool may_prefetch = (cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
4111 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
4112 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER) && !!prefetch_page_mask;
4113 block_copy_state_t copy_state = {0};
4114 uvm_va_range_t *va_range = block->va_range;
4115 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4116 uvm_va_block_region_t contig_region = {0};
4117 NvU64 cpu_migration_begin_timestamp = 0;
4118
4119 *copied_pages = 0;
4120
4121 if (UVM_ID_IS_CPU(src_id))
4122 UVM_ASSERT(src_nid != NUMA_NO_NODE);
4123
4124 if (UVM_ID_IS_CPU(dst_id))
4125 UVM_ASSERT(dst_nid != NUMA_NO_NODE);
4126
4127 // If there are no pages to be copied, exit early
4128 if (!uvm_page_mask_andnot(copy_mask, copy_mask, dst_resident_mask))
4129 return NV_OK;
4130
4131 if (migrated_pages && !uvm_page_mask_andnot(copy_mask, copy_mask, migrated_pages))
4132 return NV_OK;
4133
4134 copy_state.src.id = src_id;
4135 copy_state.dst.id = dst_id;
4136 copy_state.src.nid = src_nid;
4137 copy_state.dst.nid = dst_nid;
4138
4139 copy_state.src.is_block_contig = is_block_phys_contig(block, src_id, copy_state.src.nid);
4140 copy_state.dst.is_block_contig = is_block_phys_contig(block, dst_id, copy_state.dst.nid);
4141
4142 // uvm_range_group_range_iter_first should only be called when the va_space
4143 // lock is held, which is always the case unless an eviction is taking
4144 // place.
4145 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
4146 rgr = uvm_range_group_range_iter_first(va_space,
4147 uvm_va_block_region_start(block, region),
4148 uvm_va_block_region_end(block, region));
4149 rgr_has_changed = true;
4150 }
4151
4152 // TODO: Bug 3745051: This function is complicated and needs refactoring
4153 for_each_va_block_page_in_region_mask(page_index, copy_mask, region) {
4154 NvU64 page_start = uvm_va_block_cpu_page_address(block, page_index);
4155 uvm_make_resident_cause_t page_cause = (may_prefetch && uvm_page_mask_test(prefetch_page_mask, page_index)) ?
4156 UVM_MAKE_RESIDENT_CAUSE_PREFETCH:
4157 cause;
4158
4159 UVM_ASSERT(block_check_resident_proximity(block, block_context, page_index, dst_id));
4160 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
4161
4162         // If we're not evicting and we're migrating away from the preferred
4163         // location, add the range group range to the range group's list of
4164         // migrated ranges. Skipping this in the eviction path is safe because
4165         // the migrated_ranges list is only a UVM-Lite optimization and
4166         // eviction is not supported on UVM-Lite GPUs.
4167 if (cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION && !uvm_va_block_is_hmm(block) &&
4168 uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(va_range), src_id, src_nid)) {
4169 // rgr_has_changed is used to minimize the number of times the
4170 // migrated_ranges_lock is taken. It is set to false when the range
4171 // group range pointed by rgr is added to the migrated_ranges list,
4172 // and it is just set back to true when we move to a different
4173 // range group range.
4174
4175 // The current page could be after the end of rgr. Iterate over the
4176 // range group ranges until rgr's end location is greater than or
4177 // equal to the current page.
4178 while (rgr && rgr->node.end < page_start) {
4179 rgr = uvm_range_group_range_iter_next(va_space, rgr, uvm_va_block_region_end(block, region));
4180 rgr_has_changed = true;
4181 }
4182
4183 // Check whether the current page lies within rgr. A single page
4184 // must entirely reside within a range group range. Since we've
4185             // advanced rgr until its end is at least page_start, we now
4186             // check whether page_start lies within rgr.
4187 if (rgr && rgr_has_changed && page_start >= rgr->node.start && page_start <= rgr->node.end) {
4188 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
4189 if (list_empty(&rgr->range_group_migrated_list_node))
4190 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
4191 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
4192
4193 rgr_has_changed = false;
4194 }
4195 }
4196
4197 // No need to copy pages that haven't changed. Just clear residency
4198 // information
4199 if (block_page_is_clean(block, dst_id, copy_state.dst.nid, src_id, copy_state.src.nid, page_index))
4200 continue;
4201
4202 if (last_index == region.outer) {
4203 // Record all processors involved in the copy.
4204 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, dst_id);
4205 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, src_id);
4206 }
4207
4208         if (block_copy_should_use_push(block, &copy_state)) {
4209 if (!copying_gpu) {
4210                 status = block_copy_begin_push(block, &copy_state, &block->tracker, &push);
4211
4212 if (status != NV_OK)
4213 break;
4214
4215 copying_gpu = uvm_push_get_gpu(&push);
4216
4217 // Ensure that there is GPU state that can be used for CPU-to-CPU copies
4218 if (UVM_ID_IS_CPU(dst_id) && uvm_id_equal(src_id, dst_id)) {
4219 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, copying_gpu);
4220 if (!gpu_state) {
4221 status = NV_ERR_NO_MEMORY;
4222 break;
4223 }
4224 }
4225
4226 // Record the GPU involved in the copy
4227 uvm_processor_mask_set(&block_context->make_resident.all_involved_processors, copying_gpu->id);
4228
4229 // This function is called just once per VA block and needs to
4230 // receive the "main" cause for the migration (it mainly checks if
4231 // we are in the eviction path). Therefore, we pass cause instead
4232 // of contig_cause
4233 uvm_tools_record_block_migration_begin(block, &push, dst_id, src_id, page_start, cause);
4234 }
4235 }
4236 else {
4237 // For CPU-to-CPU copies using memcpy(), record the start of the
4238 // migration here. This will be reported in the migration event.
4239 cpu_migration_begin_timestamp = NV_GETTIME();
4240 }
4241
4242 if (!uvm_va_block_is_hmm(block))
4243 block_update_page_dirty_state(block, dst_id, copy_state.dst.nid, src_id, copy_state.src.nid, page_index);
4244
4245 if (last_index == region.outer) {
4246 bool can_cache_src_phys_addr = copy_state.src.is_block_contig;
4247 bool can_cache_dst_phys_addr = copy_state.dst.is_block_contig;
4248 contig_start_index = page_index;
4249 contig_cause = page_cause;
4250
4251             if (block_copy_should_use_push(block, &copy_state)) {
4252 // When CC is enabled, transfers between GPU and CPU don't rely on
4253 // any GPU mapping of CPU chunks, physical or virtual.
4254 if (UVM_ID_IS_CPU(src_id) && g_uvm_global.conf_computing_enabled)
4255 can_cache_src_phys_addr = false;
4256
4257 if (UVM_ID_IS_CPU(dst_id) && g_uvm_global.conf_computing_enabled)
4258 can_cache_dst_phys_addr = false;
4259 // Computing the physical address is a non-trivial operation and
4260 // seems to be a performance limiter on systems with 2 or more
4261 // NVLINK links. Therefore, for physically-contiguous block
4262 // storage, we cache the start address and compute the page address
4263 // using the page index.
4264 if (can_cache_src_phys_addr) {
4265 copy_state.src.gpu_address = block_phys_page_copy_address(block,
4266 block_phys_page(src_id,
4267 copy_state.src.nid,
4268 0),
4269 copying_gpu);
4270 }
4271 if (can_cache_dst_phys_addr) {
4272 copy_state.dst.gpu_address = block_phys_page_copy_address(block,
4273 block_phys_page(dst_id,
4274 copy_state.dst.nid,
4275 0),
4276 copying_gpu);
4277 }
4278 }
4279 }
4280 else if ((page_index != last_index + 1) || contig_cause != page_cause) {
4281 contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
4282 UVM_ASSERT(uvm_va_block_region_contains_region(region, contig_region));
4283
4284 // If both src and dst are physically-contiguous, consolidate copies
4285 // of contiguous pages into a single method.
4286 if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig) {
4287                 status = block_copy_pages(block, &copy_state, contig_region, &push);
4288 if (status != NV_OK)
4289 break;
4290 }
4291
4292             if (block_copy_should_use_push(block, &copy_state)) {
4293 uvm_perf_event_notify_migration(&va_space->perf_events,
4294 &push,
4295 block,
4296 dst_id,
4297 src_id,
4298 uvm_va_block_region_start(block, contig_region),
4299 uvm_va_block_region_size(contig_region),
4300 transfer_mode,
4301 contig_cause,
4302 &block_context->make_resident);
4303 }
4304 else {
4305 uvm_perf_event_notify_migration_cpu(&va_space->perf_events,
4306 block,
4307 copy_state.dst.nid,
4308 copy_state.src.nid,
4309 uvm_va_block_region_start(block, contig_region),
4310 uvm_va_block_region_size(contig_region),
4311 cpu_migration_begin_timestamp,
4312 transfer_mode,
4313 contig_cause,
4314 &block_context->make_resident);
4315 }
4316
4317 contig_start_index = page_index;
4318 contig_cause = page_cause;
4319 }
4320
4321 if (!copy_state.src.is_block_contig || !copy_state.dst.is_block_contig) {
4322             status = block_copy_pages(block, &copy_state, uvm_va_block_region_for_page(page_index), &push);
4323 if (status != NV_OK)
4324 return status;
4325 }
4326
4327 last_index = page_index;
4328 }
4329
4330 // Copy the remaining pages
4331 contig_region = uvm_va_block_region(contig_start_index, last_index + 1);
4332 if (uvm_va_block_region_size(contig_region) && uvm_va_block_region_contains_region(region, contig_region)) {
4333 if (copy_state.src.is_block_contig && copy_state.dst.is_block_contig) {
4334             status = block_copy_pages(block, &copy_state, contig_region, &push);
4335 if (status != NV_OK)
4336 return status;
4337 }
4338
4339         if (block_copy_should_use_push(block, &copy_state)) {
4340 uvm_perf_event_notify_migration(&va_space->perf_events,
4341 &push,
4342 block,
4343 dst_id,
4344 src_id,
4345 uvm_va_block_region_start(block, contig_region),
4346 uvm_va_block_region_size(contig_region),
4347 transfer_mode,
4348 contig_cause,
4349 &block_context->make_resident);
4350 }
4351 else {
4352 uvm_perf_event_notify_migration_cpu(&va_space->perf_events,
4353 block,
4354 copy_state.dst.nid,
4355 copy_state.src.nid,
4356 uvm_va_block_region_start(block, contig_region),
4357 uvm_va_block_region_size(contig_region),
4358 cpu_migration_begin_timestamp,
4359 transfer_mode,
4360 contig_cause,
4361 &block_context->make_resident);
4362 }
4363
4364         if (block_copy_should_use_push(block, &copy_state) && copying_gpu)
4365             status = block_copy_end_push(block, &copy_state, copy_tracker, status, &push);
4366 }
4367
4368 // Update VA block status bits
4369 //
4370 // Only update the bits for the pages that succeeded
4371 if (status != NV_OK)
4372 uvm_page_mask_region_clear(copy_mask, uvm_va_block_region(page_index, PAGES_PER_UVM_VA_BLOCK));
4373
4374 *copied_pages = uvm_page_mask_weight(copy_mask);
4375 if (*copied_pages && migrated_pages)
4376 uvm_page_mask_or(migrated_pages, migrated_pages, copy_mask);
4377
4378 return status;
4379 }
4380
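// Copy pages resident on (src_id, src_nid) to dst_id. When the destination
// is the CPU, the copy is split per destination NUMA node according to the
// pages tracked in make_resident.cpu_pages_used; otherwise a single copy to
// the destination processor is issued. The number of copied pages is added
// to copied_pages_out.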
4381 static NV_STATUS block_copy_resident_pages_from(uvm_va_block_t *block,
4382 uvm_va_block_context_t *block_context,
4383 uvm_processor_id_t dst_id,
4384 uvm_processor_id_t src_id,
4385 int src_nid,
4386 uvm_va_block_region_t region,
4387 const uvm_page_mask_t *page_mask,
4388 const uvm_page_mask_t *prefetch_page_mask,
4389 uvm_va_block_transfer_mode_t transfer_mode,
4390 uvm_page_mask_t *migrated_pages,
4391 NvU32 *copied_pages_out,
4392 uvm_tracker_t *copy_tracker)
4393 {
4394 uvm_page_mask_t *copy_mask = &block_context->make_resident.copy_resident_pages_mask;
4395 uvm_page_mask_t *src_resident_mask;
4396 uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
4397 uvm_make_resident_page_tracking_t *page_tracking = &block_context->make_resident.cpu_pages_used;
4398 NvU32 copied_pages_from_src;
4399 NV_STATUS status = NV_OK;
4400 int dst_nid;
4401
4402 src_resident_mask = uvm_va_block_resident_mask_get(block, src_id, src_nid);
4403 uvm_page_mask_init_from_region(copy_mask, region, src_resident_mask);
4404
4405 if (page_mask)
4406 uvm_page_mask_and(copy_mask, copy_mask, page_mask);
4407
4408 if (UVM_ID_IS_CPU(dst_id)) {
4409 for_each_node_mask(dst_nid, page_tracking->nodes) {
4410 if (!uvm_page_mask_and(node_pages_mask, copy_mask, block_tracking_node_mask_get(block_context, dst_nid)))
4411 continue;
4412
4413 status = block_copy_resident_pages_between(block,
4414 block_context,
4415 dst_id,
4416 dst_nid,
4417 src_id,
4418 src_nid,
4419 region,
4420 node_pages_mask,
4421 prefetch_page_mask,
4422 transfer_mode,
4423 migrated_pages,
4424 &copied_pages_from_src,
4425 copy_tracker);
4426
4427 *copied_pages_out += copied_pages_from_src;
4428
4429 if (status != NV_OK)
4430 break;
4431
4432 if (!uvm_page_mask_andnot(copy_mask, copy_mask, node_pages_mask))
4433 break;
4434 }
4435 }
4436 else {
4437 status = block_copy_resident_pages_between(block,
4438 block_context,
4439 dst_id,
4440 NUMA_NO_NODE,
4441 src_id,
4442 src_nid,
4443 region,
4444 copy_mask,
4445 prefetch_page_mask,
4446 transfer_mode,
4447 migrated_pages,
4448 &copied_pages_from_src,
4449 copy_tracker);
4450 *copied_pages_out += copied_pages_from_src;
4451 }
4452
4453 return status;
4454 }
4455
4456 // Copy resident pages to the destination from all source processors in the
4457 // src_processor_mask
4458 //
4459 // The function adds the pages that were successfully copied to the output
4460 // migrated_pages mask and returns the number of pages in copied_pages. These
4461 // fields are reliable even if an error is returned.
4462 static NV_STATUS block_copy_resident_pages_mask(uvm_va_block_t *block,
4463 uvm_va_block_context_t *block_context,
4464 uvm_processor_id_t dst_id,
4465 const uvm_processor_mask_t *src_processor_mask,
4466 uvm_va_block_region_t region,
4467 const uvm_page_mask_t *page_mask,
4468 const uvm_page_mask_t *prefetch_page_mask,
4469 uvm_va_block_transfer_mode_t transfer_mode,
4470 NvU32 max_pages_to_copy,
4471 uvm_page_mask_t *migrated_pages,
4472 NvU32 *copied_pages_out,
4473 uvm_tracker_t *tracker_out)
4474 {
4475 NV_STATUS status = NV_OK;
4476 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
4477 uvm_processor_id_t src_id;
4478 uvm_processor_mask_t *search_mask;
4479
4480 *copied_pages_out = 0;
4481
4482 search_mask = uvm_processor_mask_cache_alloc();
4483 if (!search_mask)
4484 return NV_ERR_NO_MEMORY;
4485
4486 uvm_processor_mask_copy(search_mask, src_processor_mask);
4487
4488 for_each_closest_id(src_id, search_mask, dst_id, va_space) {
4490
4491 if (UVM_ID_IS_CPU(src_id)) {
4492 int nid;
4493
4494 for_each_possible_uvm_node(nid) {
4495 status = block_copy_resident_pages_from(block,
4496 block_context,
4497 dst_id,
4498 src_id,
4499 nid,
4500 region,
4501 page_mask,
4502 prefetch_page_mask,
4503 transfer_mode,
4504 migrated_pages,
4505 copied_pages_out,
4506 tracker_out);
4507
4508 if (status != NV_OK)
4509 break;
4510 }
4511 }
4512 else {
4513 status = block_copy_resident_pages_from(block,
4514 block_context,
4515 dst_id,
4516 src_id,
4517 NUMA_NO_NODE,
4518 region,
4519 page_mask,
4520 prefetch_page_mask,
4521 transfer_mode,
4522 migrated_pages,
4523 copied_pages_out,
4524 tracker_out);
4525
4526 }
4527
4528 UVM_ASSERT(*copied_pages_out <= max_pages_to_copy);
4529
4530 if (status != NV_OK)
4531 break;
4532
4533 // Break out once we copied max pages already
4534 if (*copied_pages_out == max_pages_to_copy)
4535 break;
4536 }
4537
4538 uvm_processor_mask_cache_free(search_mask);
4539 return status;
4540 }
4541
4542 static void break_read_duplication_in_region(uvm_va_block_t *block,
4543 uvm_va_block_context_t *block_context,
4544 uvm_processor_id_t dst_id,
4545 uvm_va_block_region_t region,
4546 const uvm_page_mask_t *page_mask)
4547 {
4548 uvm_processor_id_t id;
4549 uvm_page_mask_t *break_pages_in_region = &block_context->scratch_page_mask;
4550
4551 uvm_page_mask_init_from_region(break_pages_in_region, region, page_mask);
4552
4553 UVM_ASSERT(
4554 uvm_page_mask_subset(break_pages_in_region, uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE)));
4555
4556 // Clear read_duplicated bit for all pages in region
4557 uvm_page_mask_andnot(&block->read_duplicated_pages, &block->read_duplicated_pages, break_pages_in_region);
4558
4559 // Clear residency bits for all processors other than dst_id
4560 for_each_id_in_mask(id, &block->resident) {
4561 uvm_page_mask_t *other_resident_mask;
4562
4563 // Skip the destination processor, unless it's the CPU and a specific
4564 // NUMA node is the target destination. This is because CPU-to-CPU
4565 // migrations will switch the residency from one NUMA node to another
4566 // but the resident processor will remain the CPU.
4567 if (uvm_id_equal(id, dst_id) &&
4568 (!UVM_ID_IS_CPU(dst_id) || block_context->make_resident.dest_nid == NUMA_NO_NODE))
4569 continue;
4570
4571 if (UVM_ID_IS_CPU(id)) {
4572 uvm_va_block_cpu_clear_resident_all_chunks(block, block_context, break_pages_in_region);
4573 other_resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
4574 }
4575 else {
4576 other_resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
4577 uvm_page_mask_andnot(other_resident_mask, other_resident_mask, break_pages_in_region);
4578 }
4579
4580 if (uvm_page_mask_empty(other_resident_mask))
4581 block_clear_resident_processor(block, id);
4582 }
4583 }
4584
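// Mark the pages in region that are not resident anywhere as resident on
// dst_id. These first-touch pages were populated directly on the
// destination, so no copy is needed. They are also added to the
// pages_changed_residency output mask.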
4585 static void block_copy_set_first_touch_residency(uvm_va_block_t *block,
4586 uvm_va_block_context_t *block_context,
4587 uvm_processor_id_t dst_id,
4588 uvm_va_block_region_t region,
4589 const uvm_page_mask_t *page_mask)
4590 {
4591 uvm_page_index_t page_index;
4592 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, dst_id, NUMA_NO_NODE);
4593 uvm_page_mask_t *first_touch_mask = &block_context->make_resident.page_mask;
4594
4595 if (page_mask)
4596 uvm_page_mask_andnot(first_touch_mask, page_mask, resident_mask);
4597 else
4598 uvm_page_mask_complement(first_touch_mask, resident_mask);
4599
4600 uvm_page_mask_region_clear_outside(first_touch_mask, region);
4601
4602 for_each_va_block_page_in_mask(page_index, first_touch_mask, block) {
4603 UVM_ASSERT(!block_is_page_resident_anywhere(block, page_index));
4604 UVM_ASSERT(block_processor_page_is_populated(block, dst_id, page_index));
4605 UVM_ASSERT(block_check_resident_proximity(block, block_context, page_index, dst_id));
4606 }
4607
4608 if (UVM_ID_IS_CPU(dst_id)) {
4609 uvm_va_block_cpu_set_resident_all_chunks(block, block_context, first_touch_mask);
4610 resident_mask = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE);
4611 }
4612 else {
4613 uvm_page_mask_or(resident_mask, resident_mask, first_touch_mask);
4614 }
4615
4616 if (!uvm_page_mask_empty(resident_mask))
4617 block_set_resident_processor(block, dst_id);
4618
4619 // Add them to the output mask, too
4620 uvm_page_mask_or(&block_context->make_resident.pages_changed_residency,
4621 &block_context->make_resident.pages_changed_residency,
4622 first_touch_mask);
4623 }
4624
4625 // Select the set of CPU pages to be used for the migration. The pages selected
4626 // could be used for either CPU destination pages (when the destination of the
4627 // migration is the CPU) or staging pages (when the migration to the destination
4628 // processor requires staging through the CPU).
4629 static void block_select_cpu_node_pages(uvm_va_block_t *block,
4630 uvm_va_block_context_t *block_context,
4631 const uvm_page_mask_t *page_mask,
4632 uvm_va_block_region_t region)
4633 {
4634 uvm_va_block_cpu_node_state_t *node_state;
4635 uvm_make_resident_page_tracking_t *tracking = &block_context->make_resident.cpu_pages_used;
4636 uvm_page_mask_t *scratch_page_mask = &block_context->scratch_page_mask;
4637 uvm_page_mask_t *node_mask;
4638 int nid;
4639
4640 if (uvm_page_mask_empty(page_mask))
4641 return;
4642
4643 block_context->scratch_node_mask = node_possible_map;
4644 uvm_page_mask_init_from_region(scratch_page_mask, region, page_mask);
4645
4646 for_each_closest_uvm_node(nid, uvm_va_block_context_get_node(block_context), block_context->scratch_node_mask) {
4647 node_state = block_node_state_get(block, nid);
4648 node_mask = block_tracking_node_mask_get(block_context, nid);
4649 if (uvm_page_mask_and(node_mask, scratch_page_mask, &node_state->allocated)) {
4650 node_set(nid, tracking->nodes);
4651 if (!uvm_page_mask_andnot(scratch_page_mask, scratch_page_mask, node_mask))
4652 return;
4653 }
4654 }
4655 }
4656
4657 // Copy resident pages from other processors to the destination.
4658 // All the pages on the destination need to be populated by the caller first.
4659 // Pages not resident anywhere else need to be zeroed out as well.
4660 // The transfer_mode is only used to tell uvm_perf_event_notify_migration()
4661 // whether the copy is for a migration or read duplication.
4662 static NV_STATUS block_copy_resident_pages(uvm_va_block_t *block,
4663 uvm_va_block_context_t *block_context,
4664 uvm_processor_id_t dst_id,
4665 uvm_va_block_region_t region,
4666 const uvm_page_mask_t *page_mask,
4667 const uvm_page_mask_t *prefetch_page_mask,
4668 uvm_va_block_transfer_mode_t transfer_mode)
4669 {
4670 NV_STATUS status = NV_OK;
4671 NV_STATUS tracker_status;
4672 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
4673 uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block,
4674 dst_id,
4675 block_context->make_resident.dest_nid);
4676 NvU32 missing_pages_count;
4677 NvU32 pages_copied;
4678 NvU32 pages_copied_to_cpu = 0;
4679 uvm_processor_mask_t *src_processor_mask = NULL;
4680 uvm_page_mask_t *copy_page_mask = &block_context->make_resident.page_mask;
4681 uvm_page_mask_t *migrated_pages = &block_context->make_resident.pages_migrated;
4682 uvm_page_mask_t *pages_staged = &block_context->make_resident.pages_staged;
4683 uvm_page_mask_t *cpu_page_mask;
4684 uvm_page_mask_t *numa_resident_pages;
4685 int nid;
4686
4687 uvm_page_mask_zero(migrated_pages);
4688
4689 if (page_mask)
4690 uvm_page_mask_andnot(copy_page_mask, page_mask, resident_mask);
4691 else
4692 uvm_page_mask_complement(copy_page_mask, resident_mask);
4693
4694 missing_pages_count = uvm_page_mask_region_weight(copy_page_mask, region);
4695
4696 if (missing_pages_count == 0)
4697 goto out;
4698
4699 src_processor_mask = uvm_processor_mask_cache_alloc();
4700 if (!src_processor_mask) {
4701 status = NV_ERR_NO_MEMORY;
4702 goto out;
4703 }
4704
4705 // TODO: Bug 1753731: Add P2P2P copies staged through a GPU
4706 // TODO: Bug 1753731: When a page is resident in multiple locations due to
4707 // read-duplication, spread out the source of the copy so we don't
4708 // bottleneck on a single location.
4709
4710 uvm_processor_mask_zero(src_processor_mask);
4711
4712 if (UVM_ID_IS_GPU(dst_id)) {
4713 // If the destination is a GPU, first copy everything from processors
4714         // with supported copy access. Notably this will also copy pages from
4715         // the CPU, even if some extra copies from the CPU are required later
4716         // for staged copies.
4717 uvm_processor_mask_and(src_processor_mask, block_get_can_copy_from_mask(block, dst_id), &block->resident);
4718 uvm_processor_mask_clear(src_processor_mask, dst_id);
4719
4720 cpu_page_mask = pages_staged;
4721 }
4722 else {
4723 cpu_page_mask = copy_page_mask;
4724 }
4725
4726 block_select_cpu_node_pages(block, block_context, cpu_page_mask, region);
4727
4728 if (UVM_ID_IS_GPU(dst_id)) {
4729 status = block_copy_resident_pages_mask(block,
4730 block_context,
4731 dst_id,
4732 src_processor_mask,
4733 region,
4734 copy_page_mask,
4735 prefetch_page_mask,
4736 transfer_mode,
4737 missing_pages_count,
4738 migrated_pages,
4739 &pages_copied,
4740 &local_tracker);
4741
4742 UVM_ASSERT(missing_pages_count >= pages_copied);
4743 missing_pages_count -= pages_copied;
4744
4745 if (status != NV_OK)
4746 goto out;
4747
4748 if (missing_pages_count == 0) {
4749 UVM_ASSERT(uvm_page_mask_empty(pages_staged));
4750 goto out;
4751 }
4752
4753 if (pages_copied)
4754 uvm_page_mask_andnot(copy_page_mask, copy_page_mask, migrated_pages);
4755 }
4756
4757 // Now copy from everywhere else to the CPU. This is both for when the
4758 // destination is the CPU (src_processor_mask empty) and for a staged copy
4759 // (src_processor_mask containing processors with copy access to dst_id).
4760 uvm_processor_mask_andnot(src_processor_mask, &block->resident, src_processor_mask);
4761
4762 // If the destination is the CPU but not all pages are resident on the
4763 // destination NUMA node, the CPU is still a source.
4764 numa_resident_pages = uvm_va_block_resident_mask_get(block, UVM_ID_CPU, block_context->make_resident.dest_nid);
4765 if (!UVM_ID_IS_CPU(dst_id) || uvm_page_mask_subset(copy_page_mask, numa_resident_pages)) {
4766 uvm_processor_mask_clear(src_processor_mask, dst_id);
4767 uvm_processor_mask_clear(src_processor_mask, UVM_ID_CPU);
4768 }
4769
4770
4771 if (!uvm_page_mask_empty(cpu_page_mask)) {
4772 status = block_copy_resident_pages_mask(block,
4773 block_context,
4774 UVM_ID_CPU,
4775 src_processor_mask,
4776 region,
4777 cpu_page_mask,
4778 prefetch_page_mask,
4779 transfer_mode,
4780 missing_pages_count,
4781 UVM_ID_IS_CPU(dst_id) ? migrated_pages : NULL,
4782 &pages_copied_to_cpu,
4783 &local_tracker);
4784
4785 if (status != NV_OK)
4786 goto out;
4787 }
4788
4789 // If destination is the CPU then we copied everything there above
4790 if (!UVM_ID_IS_GPU(dst_id))
4791 goto out;
4792
4793 // Add everything to the block's tracker so that the
4794 // block_copy_resident_pages_between() call below will acquire it.
4795 status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
4796 if (status != NV_OK)
4797 goto out;
4798 uvm_tracker_clear(&local_tracker);
4799
4800 // Now copy staged pages from the CPU to the destination.
4801 // The staging copy above could have allocated pages on any NUMA node.
4802 // Loop over all nodes where pages were allocated and copy from those
4803 // nodes.
4804 pages_copied = 0;
4805 for_each_node_mask(nid, block_context->make_resident.cpu_pages_used.nodes) {
4806 NvU32 pages_copied_from_node;
4807 uvm_page_mask_t *node_pages_mask = &block_context->make_resident.node_pages_mask;
4808 uvm_page_mask_t *node_alloc_mask = block_tracking_node_mask_get(block_context, nid);
4809
4810 if (uvm_page_mask_and(node_pages_mask, pages_staged, node_alloc_mask)) {
4811 status = block_copy_resident_pages_between(block,
4812 block_context,
4813 dst_id,
4814 NUMA_NO_NODE,
4815 UVM_ID_CPU,
4816 nid,
4817 region,
4818 node_pages_mask,
4819 prefetch_page_mask,
4820 transfer_mode,
4821 migrated_pages,
4822 &pages_copied_from_node,
4823 &local_tracker);
4824 UVM_ASSERT(missing_pages_count >= pages_copied_from_node);
4825 missing_pages_count -= pages_copied_from_node;
4826 pages_copied += pages_copied_from_node;
4827 }
4828
4829 if (status != NV_OK)
4830 break;
4831 }
4832
4833 if (status != NV_OK)
4834 goto out;
4835
4836 // If we get here, that means we were staging the copy through the CPU and
4837 // we should copy as many pages from the CPU as we copied to the CPU.
4838 UVM_ASSERT(pages_copied == pages_copied_to_cpu);
4839
4840 out:
4841 // Add everything from the local tracker to the block's tracker.
4842 // Notably this is also needed for handling
4843 // block_copy_resident_pages_between() failures in the first loop.
4844 tracker_status = uvm_tracker_add_tracker_safe(&block->tracker, &local_tracker);
4845 uvm_tracker_deinit(&local_tracker);
4846 uvm_processor_mask_cache_free(src_processor_mask);
4847
4848 return status == NV_OK ? tracker_status : status;
4849 }
4850
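// Copy phase of making region resident on dest_id: unmap non-resident and
// read-duplicated pages from all non-UVM-Lite processors, populate the
// destination, and copy the resident pages over. Residency bits are updated
// later by uvm_va_block_make_resident_finish().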
4851 NV_STATUS uvm_va_block_make_resident_copy(uvm_va_block_t *va_block,
4852 uvm_va_block_retry_t *va_block_retry,
4853 uvm_va_block_context_t *va_block_context,
4854 uvm_processor_id_t dest_id,
4855 uvm_va_block_region_t region,
4856 const uvm_page_mask_t *page_mask,
4857 const uvm_page_mask_t *prefetch_page_mask,
4858 uvm_make_resident_cause_t cause)
4859 {
4860 NV_STATUS status = NV_OK;
4861 uvm_processor_mask_t *unmap_processor_mask;
4862 uvm_page_mask_t *unmap_page_mask = &va_block_context->make_resident.page_mask;
4863 uvm_page_mask_t *resident_mask;
4864
4865 va_block_context->make_resident.dest_id = dest_id;
4866 va_block_context->make_resident.cause = cause;
4867 nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
4868
4869 if (prefetch_page_mask) {
4870 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
4871 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
4872 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
4873 }
4874
4875 uvm_assert_mutex_locked(&va_block->lock);
4876 UVM_ASSERT(uvm_va_block_is_hmm(va_block) || va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
4877
4878 unmap_processor_mask = uvm_processor_mask_cache_alloc();
4879 if (!unmap_processor_mask) {
4880 status = NV_ERR_NO_MEMORY;
4881 goto out;
4882 }
4883
4884 resident_mask = block_resident_mask_get_alloc(va_block, dest_id, va_block_context->make_resident.dest_nid);
4885 if (!resident_mask) {
4886 status = NV_ERR_NO_MEMORY;
4887 goto out;
4888 }
4889
4890 // Unmap all mapped processors except for UVM-Lite GPUs as their mappings
4891 // are largely persistent.
4892 uvm_processor_mask_andnot(unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
4893
4894 if (page_mask)
4895 uvm_page_mask_andnot(unmap_page_mask, page_mask, resident_mask);
4896 else
4897 uvm_page_mask_complement(unmap_page_mask, resident_mask);
4898 uvm_page_mask_region_clear_outside(unmap_page_mask, region);
4899
4900 // Unmap all pages not resident on the destination
4901 status = uvm_va_block_unmap_mask(va_block, va_block_context, unmap_processor_mask, region, unmap_page_mask);
4902 if (status != NV_OK)
4903 goto out;
4904
4905 if (page_mask)
4906 uvm_page_mask_and(unmap_page_mask, page_mask, &va_block->read_duplicated_pages);
4907 else
4908 uvm_page_mask_init_from_region(unmap_page_mask, region, &va_block->read_duplicated_pages);
4909 uvm_page_mask_region_clear_outside(unmap_page_mask, region);
4910
4911 // Also unmap read-duplicated pages excluding dest_id
4912 uvm_processor_mask_clear(unmap_processor_mask, dest_id);
4913 status = uvm_va_block_unmap_mask(va_block, va_block_context, unmap_processor_mask, region, unmap_page_mask);
4914 if (status != NV_OK)
4915 goto out;
4916
4917 uvm_tools_record_read_duplicate_invalidate(va_block,
4918 dest_id,
4919 region,
4920 unmap_page_mask);
4921
4922 // Note that block_populate_pages and block_copy_resident_pages also use
4923 // va_block_context->make_resident.page_mask.
4924 unmap_page_mask = NULL;
4925
4926 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
4927 if (status != NV_OK)
4928 goto out;
4929
4930 status = block_copy_resident_pages(va_block,
4931 va_block_context,
4932 dest_id,
4933 region,
4934 page_mask,
4935 prefetch_page_mask,
4936 UVM_VA_BLOCK_TRANSFER_MODE_MOVE);
4937
4938 out:
4939 uvm_processor_mask_cache_free(unmap_processor_mask);
4940 return status;
4941 }
4942
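// Clear the evicted bits of the given pages on dst_id. If no evicted pages
// remain, dst_id is also removed from the block's evicted_gpus mask.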
4943 static void block_make_resident_clear_evicted(uvm_va_block_t *va_block,
4944 uvm_processor_id_t dst_id,
4945 uvm_page_mask_t *page_mask)
4946 {
4947 uvm_va_block_gpu_state_t *dst_gpu_state = uvm_va_block_gpu_state_get(va_block, dst_id);
4948
4949 UVM_ASSERT(dst_gpu_state);
4950
4951 if (!uvm_page_mask_andnot(&dst_gpu_state->evicted, &dst_gpu_state->evicted, page_mask))
4952 uvm_processor_mask_clear(&va_block->evicted_gpus, dst_id);
4953 }
4954
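// Update residency tracking after the pages in copy_mask moved to dst_id:
// set the resident bits (per chunk NUMA node for the CPU), accumulate the
// pages_changed_residency output mask, clear maybe_mapped_pages for non-HMM
// blocks, and update the per-GPU evicted state based on the migration cause.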
4955 static void block_make_resident_update_state(uvm_va_block_t *va_block,
4956 uvm_va_block_context_t *va_block_context,
4957 uvm_processor_id_t dst_id,
4958 uvm_va_block_region_t region,
4959 uvm_page_mask_t *copy_mask,
4960 uvm_make_resident_cause_t cause)
4961 {
4962 if (UVM_ID_IS_CPU(dst_id)) {
4963 // CPU chunks may not have been allocated on the preferred NUMA node. So,
4964 // the residency has to be updated based on the chunk's NUMA ID.
4965 uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, copy_mask);
4966 }
4967 else {
4968 uvm_page_mask_t *dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dst_id, NUMA_NO_NODE);
4969
4970 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, copy_mask);
4971 }
4972
4973 block_set_resident_processor(va_block, dst_id);
4974
4975 // Accumulate the pages that migrated into the output mask.
4976 uvm_page_mask_or(&va_block_context->make_resident.pages_changed_residency,
4977 &va_block_context->make_resident.pages_changed_residency,
4978 copy_mask);
4979
4980 // Any move operation implies that mappings have been removed from all
4981 // non-UVM-Lite GPUs.
4982 if (!uvm_va_block_is_hmm(va_block))
4983 uvm_page_mask_andnot(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, copy_mask);
4984
4985 // If we are migrating due to an eviction, set the GPU as evicted and
4986 // mark the evicted pages. If we are migrating away from the CPU this
4987 // means that those pages are not evicted.
4988 if (cause == UVM_MAKE_RESIDENT_CAUSE_EVICTION) {
4989 uvm_processor_id_t src_id;
4990
4991 UVM_ASSERT(UVM_ID_IS_CPU(dst_id));
4992
4993 // Note that the destination is the CPU so this loop excludes it.
4994 for_each_gpu_id_in_mask(src_id, &va_block_context->make_resident.all_involved_processors) {
4995 uvm_va_block_gpu_state_t *src_gpu_state = uvm_va_block_gpu_state_get(va_block, src_id);
4996
4997 UVM_ASSERT(src_gpu_state);
4998
4999 uvm_page_mask_or(&src_gpu_state->evicted, &src_gpu_state->evicted, copy_mask);
5000 uvm_processor_mask_set(&va_block->evicted_gpus, src_id);
5001 }
5002 }
5003 else if (UVM_ID_IS_GPU(dst_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dst_id))
5004 block_make_resident_clear_evicted(va_block, dst_id, copy_mask);
5005 }
5006
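// Finish phase of making region resident: mark the migrated and first-touch
// pages resident on the destination, break read duplication for them, and
// update the eviction heuristics.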
5007 void uvm_va_block_make_resident_finish(uvm_va_block_t *va_block,
5008 uvm_va_block_context_t *va_block_context,
5009 uvm_va_block_region_t region,
5010 const uvm_page_mask_t *page_mask)
5011 {
5012 uvm_page_mask_t *migrated_pages = &va_block_context->make_resident.pages_migrated;
5013 uvm_processor_id_t dst_id = va_block_context->make_resident.dest_id;
5014
5015 uvm_assert_mutex_locked(&va_block->lock);
5016
5017 if (page_mask)
5018 uvm_page_mask_and(migrated_pages, migrated_pages, page_mask);
5019
5020 if (!uvm_page_mask_empty(migrated_pages)) {
5021 // The migrated pages are now resident on the destination.
5022 block_make_resident_update_state(va_block,
5023 va_block_context,
5024 dst_id,
5025 region,
5026 migrated_pages,
5027 va_block_context->make_resident.cause);
5028 }
5029
5030 // Pages that weren't resident anywhere else were populated at the
5031 // destination directly. Mark them as resident now.
5032 block_copy_set_first_touch_residency(va_block, va_block_context, dst_id, region, page_mask);
5033
5034 // Break read duplication and clear residency from other processors.
5035 break_read_duplication_in_region(va_block, va_block_context, dst_id, region, page_mask);
5036
5037 // Update eviction heuristics, if needed. Notably this could repeat the call
5038 // done in block_set_resident_processor(), but that doesn't do anything bad
5039 // and it's simpler to keep it in both places.
5040 //
5041 // Skip this if we didn't do anything (the input region and/or page mask was
5042 // empty).
5043 if (uvm_processor_mask_test(&va_block->resident, dst_id))
5044 block_mark_memory_used(va_block, dst_id);
5045
5046 // Check state of all chunks after residency change.
5047 // TODO: Bug 4207783: Check both CPU and GPU chunks.
5048 UVM_ASSERT(block_check_cpu_chunks(va_block));
5049 }
5050
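// Make region resident on dest_id by performing the copy phase followed by
// the finish phase above.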
5051 NV_STATUS uvm_va_block_make_resident(uvm_va_block_t *va_block,
5052 uvm_va_block_retry_t *va_block_retry,
5053 uvm_va_block_context_t *va_block_context,
5054 uvm_processor_id_t dest_id,
5055 uvm_va_block_region_t region,
5056 const uvm_page_mask_t *page_mask,
5057 const uvm_page_mask_t *prefetch_page_mask,
5058 uvm_make_resident_cause_t cause)
5059 {
5060 NV_STATUS status;
5061
5062 status = uvm_va_block_make_resident_copy(va_block,
5063 va_block_retry,
5064 va_block_context,
5065 dest_id,
5066 region,
5067 page_mask,
5068 prefetch_page_mask,
5069 cause);
5070 if (status != NV_OK)
5071 return status;
5072
5073 uvm_va_block_make_resident_finish(va_block,
5074 va_block_context,
5075 region,
5076 page_mask);
5077
5078 return NV_OK;
5079 }
5080
5081 // Combination function which prepares the input {region, page_mask} for
5082 // entering read-duplication. It:
5083 // - Unmaps all processors but revoke_id
5084 // - Revokes write access from revoke_id
5085 static NV_STATUS block_prep_read_duplicate_mapping(uvm_va_block_t *va_block,
5086 uvm_va_block_context_t *va_block_context,
5087 uvm_processor_id_t revoke_id,
5088 uvm_va_block_region_t region,
5089 const uvm_page_mask_t *page_mask)
5090 {
5091 uvm_processor_mask_t *unmap_processor_mask;
5092 uvm_processor_id_t unmap_id;
5093 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
5094 NV_STATUS status, tracker_status;
5095
5096 unmap_processor_mask = uvm_processor_mask_cache_alloc();
5097 if (!unmap_processor_mask)
5098 return NV_ERR_NO_MEMORY;
5099
5100 // Unmap everybody except revoke_id
5101 uvm_processor_mask_andnot(unmap_processor_mask, &va_block->mapped, block_get_uvm_lite_gpus(va_block));
5102 uvm_processor_mask_clear(unmap_processor_mask, revoke_id);
5103
5104 for_each_id_in_mask(unmap_id, unmap_processor_mask) {
5105 status = uvm_va_block_unmap(va_block, va_block_context, unmap_id, region, page_mask, &local_tracker);
5106 if (status != NV_OK)
5107 goto out;
5108 }
5109
5110 // Revoke WRITE/ATOMIC access permissions from the remaining mapped
5111 // processor.
5112 status = uvm_va_block_revoke_prot(va_block,
5113 va_block_context,
5114 revoke_id,
5115 region,
5116 page_mask,
5117 UVM_PROT_READ_WRITE,
5118 &local_tracker);
5119 if (status != NV_OK)
5120 goto out;
5121
5122 out:
5123 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
5124 uvm_tracker_deinit(&local_tracker);
5125 uvm_processor_mask_cache_free(unmap_processor_mask);
5126 return status == NV_OK ? tracker_status : status;
5127 }
5128
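// Make region resident on dest_id while keeping the existing copies
// (read duplication). Remote mappings are unmapped and write access is
// revoked from the current resident processors, the pages are copied to
// dest_id, and both the new copies and any CPU staging copies are recorded
// in read_duplicated_pages.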
5129 NV_STATUS uvm_va_block_make_resident_read_duplicate(uvm_va_block_t *va_block,
5130 uvm_va_block_retry_t *va_block_retry,
5131 uvm_va_block_context_t *va_block_context,
5132 uvm_processor_id_t dest_id,
5133 uvm_va_block_region_t region,
5134 const uvm_page_mask_t *page_mask,
5135 const uvm_page_mask_t *prefetch_page_mask,
5136 uvm_make_resident_cause_t cause)
5137 {
5138 NV_STATUS status = NV_OK;
5139 uvm_processor_id_t src_id;
5140 uvm_page_mask_t *dst_resident_mask;
5141 uvm_page_mask_t *migrated_pages;
5142 uvm_page_mask_t *staged_pages;
5143 uvm_page_mask_t *scratch_residency_mask;
5144
5145 // TODO: Bug 3660922: need to implement HMM read duplication support.
5146 UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
5147
5148 va_block_context->make_resident.dest_id = dest_id;
5149 va_block_context->make_resident.cause = cause;
5150 nodes_clear(va_block_context->make_resident.cpu_pages_used.nodes);
5151
5152 if (prefetch_page_mask) {
5153 // TODO: Bug 1877578: investigate automatic read-duplicate policies
5154 UVM_ASSERT(cause == UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT ||
5155 cause == UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT ||
5156 cause == UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER);
5157 }
5158
5159 uvm_assert_mutex_locked(&va_block->lock);
5160 UVM_ASSERT(!uvm_va_block_is_dead(va_block));
5161
5162 scratch_residency_mask = kmem_cache_alloc(g_uvm_page_mask_cache, NV_UVM_GFP_FLAGS);
5163 if (!scratch_residency_mask)
5164 return NV_ERR_NO_MEMORY;
5165
5166 // For pages that are entering read-duplication we need to unmap remote
5167 // mappings and revoke RW and higher access permissions.
5168 //
5169 // The current implementation:
5170 // - Unmaps pages from all processors but the one with the resident copy
5171 // - Revokes write access from the processor with the resident copy
5172 for_each_id_in_mask(src_id, &va_block->resident) {
5173 // Note that the below calls to block_populate_pages and
5174 // block_copy_resident_pages also use
5175 // va_block_context->make_resident.page_mask.
5176 uvm_page_mask_t *preprocess_page_mask = &va_block_context->make_resident.page_mask;
5177 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, src_id, NUMA_NO_NODE);
5178 UVM_ASSERT(!uvm_page_mask_empty(resident_mask));
5179
5180 if (page_mask)
5181 uvm_page_mask_andnot(preprocess_page_mask, page_mask, &va_block->read_duplicated_pages);
5182 else
5183 uvm_page_mask_complement(preprocess_page_mask, &va_block->read_duplicated_pages);
5184
5185 // If there are no pages that need to be unmapped/revoked, skip to the
5186 // next processor
5187 if (!uvm_page_mask_and(preprocess_page_mask, preprocess_page_mask, resident_mask))
5188 continue;
5189
5190 status = block_prep_read_duplicate_mapping(va_block, va_block_context, src_id, region, preprocess_page_mask);
5191 if (status != NV_OK)
5192 goto out;
5193 }
5194
5195 status = block_populate_pages(va_block, va_block_retry, va_block_context, dest_id, region, page_mask);
5196 if (status != NV_OK)
5197 goto out;
5198
5199 status = block_copy_resident_pages(va_block,
5200 va_block_context,
5201 dest_id,
5202 region,
5203 page_mask,
5204 prefetch_page_mask,
5205 UVM_VA_BLOCK_TRANSFER_MODE_COPY);
5206 if (status != NV_OK)
5207 goto out;
5208
5209 // Pages that weren't resident anywhere else were populated at the
5210 // destination directly. Mark them as resident now, since there were no
5211 // errors from block_copy_resident_pages() above.
5212 migrated_pages = &va_block_context->make_resident.pages_migrated;
5213 uvm_page_mask_init_from_region(scratch_residency_mask, region, page_mask);
5214 uvm_page_mask_andnot(scratch_residency_mask, scratch_residency_mask, migrated_pages);
5215
5216 if (!uvm_page_mask_empty(scratch_residency_mask))
5217 block_copy_set_first_touch_residency(va_block, va_block_context, dest_id, region, scratch_residency_mask);
5218
5219 staged_pages = &va_block_context->make_resident.pages_staged;
5220 if (!UVM_ID_IS_CPU(dest_id) && !uvm_page_mask_empty(staged_pages)) {
5221 uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, staged_pages);
5222 block_set_resident_processor(va_block, UVM_ID_CPU);
5223 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, staged_pages);
5224 uvm_tools_record_read_duplicate(va_block, UVM_ID_CPU, region, staged_pages);
5225 }
5226
5227 if (!uvm_page_mask_empty(migrated_pages)) {
5228 if (UVM_ID_IS_CPU(dest_id)) {
5229 // Check if the CPU is already in the resident set of processors.
5230 // We need to do this since we can't have multiple NUMA nodes with
5231 // resident pages.
5232             // If any of the migrated pages were already resident on the CPU, the
5233 // residency has to be switched to the destination NUMA node.
5234 if (uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) &&
5235 uvm_page_mask_and(scratch_residency_mask,
5236 uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE),
5237 migrated_pages)) {
5238 uvm_va_block_cpu_clear_resident_all_chunks(va_block, va_block_context, scratch_residency_mask);
5239 }
5240
5241 uvm_va_block_cpu_set_resident_all_chunks(va_block, va_block_context, migrated_pages);
5242 }
5243 else {
5244 dst_resident_mask = uvm_va_block_resident_mask_get(va_block, dest_id, NUMA_NO_NODE);
5245 uvm_page_mask_or(dst_resident_mask, dst_resident_mask, migrated_pages);
5246 }
5247
5248 block_set_resident_processor(va_block, dest_id);
5249 uvm_page_mask_or(&va_block->read_duplicated_pages, &va_block->read_duplicated_pages, migrated_pages);
5250 uvm_tools_record_read_duplicate(va_block, dest_id, region, migrated_pages);
5251 }
5252
5253 UVM_ASSERT(cause != UVM_MAKE_RESIDENT_CAUSE_EVICTION);
5254 if (UVM_ID_IS_GPU(dest_id) && uvm_processor_mask_test(&va_block->evicted_gpus, dest_id))
5255 block_make_resident_clear_evicted(va_block, dest_id, migrated_pages);
5256
5257 // Update eviction heuristics, if needed. Notably this could repeat the call
5258 // done in block_set_resident_processor(), but that doesn't do anything bad
5259 // and it's simpler to keep it in both places.
5260 //
5261 // Skip this if we didn't do anything (the input region and/or page mask was
5262 // empty).
5263 if (uvm_processor_mask_test(&va_block->resident, dest_id))
5264 block_mark_memory_used(va_block, dest_id);
5265
5266 // Check state of all chunks after residency change.
5267 // TODO: Bug 4207783: Check both CPU and GPU chunks.
5268 UVM_ASSERT(block_check_cpu_chunks(va_block));
5269 out:
5270 kmem_cache_free(g_uvm_page_mask_cache, scratch_residency_mask);
5271 return status;
5272 }
5273
5274 // Looks up the current CPU mapping state of page from the
5275 // block->cpu.pte_bits bitmaps. If write access is enabled,
5276 // UVM_PROT_READ_WRITE_ATOMIC is returned instead of UVM_PROT_READ_WRITE, since
5277 // write access implies atomic access for CPUs.
5278 static uvm_prot_t block_page_prot_cpu(uvm_va_block_t *block, uvm_page_index_t page_index)
5279 {
5280 uvm_prot_t prot;
5281
5282 UVM_ASSERT(!uvm_va_block_is_dead(block));
5283
5284 if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index))
5285 prot = UVM_PROT_READ_WRITE_ATOMIC;
5286 else if (uvm_page_mask_test(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index))
5287 prot = UVM_PROT_READ_ONLY;
5288 else
5289 prot = UVM_PROT_NONE;
5290
5291 return prot;
5292 }
5293
5294 // Looks up the current GPU mapping state of page from the
5295 // block->gpus[i]->pte_bits bitmaps.
5296 static uvm_prot_t block_page_prot_gpu(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_page_index_t page_index)
5297 {
5298 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5299 uvm_prot_t prot;
5300
5301 UVM_ASSERT(!uvm_va_block_is_dead(block));
5302
5303 if (!gpu_state)
5304 return UVM_PROT_NONE;
5305
5306 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], page_index))
5307 prot = UVM_PROT_READ_WRITE_ATOMIC;
5308 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], page_index))
5309 prot = UVM_PROT_READ_WRITE;
5310 else if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], page_index))
5311 prot = UVM_PROT_READ_ONLY;
5312 else
5313 prot = UVM_PROT_NONE;
5314
5315 return prot;
5316 }
5317
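// Looks up the current mapping state of page for the given processor by
// dispatching to the CPU or GPU helper above.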
5318 static uvm_prot_t block_page_prot(uvm_va_block_t *block, uvm_processor_id_t id, uvm_page_index_t page_index)
5319 {
5320 if (UVM_ID_IS_CPU(id))
5321 return block_page_prot_cpu(block, page_index);
5322 else
5323 return block_page_prot_gpu(block, block_get_gpu(block, id), page_index);
5324 }
5325
5326 // Returns true if the block has any valid CPU PTE mapping in the block region.
5327 static bool block_has_valid_mapping_cpu(uvm_va_block_t *block, uvm_va_block_region_t region)
5328 {
5329 size_t valid_page;
5330
5331 UVM_ASSERT(region.outer <= uvm_va_block_num_cpu_pages(block));
5332
5333 // Early-out: check whether any address in this block has a CPU mapping
5334 if (!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
5335 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]));
5336 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
5337 return false;
5338 }
5339
5340 // All valid mappings have at least read permissions so we only need to
5341 // inspect the read bits.
5342 valid_page = uvm_va_block_first_page_in_mask(region, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
5343 if (valid_page == region.outer)
5344 return false;
5345
5346 UVM_ASSERT(block_page_prot_cpu(block, valid_page) != UVM_PROT_NONE);
5347 return true;
5348 }
5349
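// Sanity check that, for every indirect peer of the chunk's GPU, the
// reverse sysmem mapping of the chunk's peer address points back at this
// block with the expected page index and size.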
5350 static bool block_check_chunk_indirect_peers(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
5351 {
5352 uvm_gpu_t *accessing_gpu;
5353 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5354
5355 if (!uvm_pmm_sysmem_mappings_indirect_supported())
5356 return true;
5357
5358 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
5359 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
5360 uvm_reverse_map_t reverse_map;
5361 size_t num_mappings;
5362
5363 num_mappings = uvm_pmm_sysmem_mappings_dma_to_virt(&accessing_gpu->pmm_reverse_sysmem_mappings,
5364 peer_addr,
5365 uvm_gpu_chunk_get_size(chunk),
5366 &reverse_map,
5367 1);
5368 UVM_ASSERT(num_mappings == 1);
5369 UVM_ASSERT(reverse_map.va_block == block);
5370 UVM_ASSERT(reverse_map.region.first == chunk->va_block_page_index);
5371 UVM_ASSERT(uvm_va_block_region_size(reverse_map.region) == uvm_gpu_chunk_get_size(chunk));
5372
5373 uvm_va_block_release_no_destroy(reverse_map.va_block);
5374 }
5375
5376 return true;
5377 }
5378
5379 // Sanity check the given GPU's chunks array
5380 static bool block_check_gpu_chunks(uvm_va_block_t *block, uvm_gpu_id_t id)
5381 {
5382 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
5383 uvm_gpu_t *gpu;
5384 size_t i, num_chunks;
5385 uvm_page_index_t page_index;
5386 uvm_chunk_size_t chunk_size;
5387
5388 if (!gpu_state)
5389 return true;
5390
5391 gpu = block_get_gpu(block, id);
5392
5393 num_chunks = block_num_gpu_chunks(block, gpu);
5394 for (page_index = 0, i = 0; i < num_chunks; i++) {
5395 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
5396 size_t chunk_index = block_gpu_chunk_index(block, gpu, page_index, &chunk_size);
5397
5398 if (chunk_index != i) {
5399 UVM_ERR_PRINT("chunk index mismatch: calculated %zu, is in %zu. VA block [0x%llx, 0x%llx) GPU %u page_index: %u\n",
5400 chunk_index,
5401 i,
5402 block->start,
5403 block->end + 1,
5404 uvm_id_value(id),
5405 page_index);
5406 return false;
5407 }
5408
5409 if (chunk) {
5410 if (chunk_size != uvm_gpu_chunk_get_size(chunk)) {
5411 UVM_ERR_PRINT("chunk size mismatch: calc %u, actual %u. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu\n",
5412 chunk_size,
5413 uvm_gpu_chunk_get_size(chunk),
5414 block->start,
5415 block->end + 1,
5416 uvm_id_value(id),
5417 page_index,
5418 i);
5419 return false;
5420 }
5421
5422 if (chunk->state != UVM_PMM_GPU_CHUNK_STATE_ALLOCATED) {
5423 UVM_ERR_PRINT("Invalid chunk state %s. VA block [0x%llx, 0x%llx) GPU: %u page_index: %u chunk index: %zu chunk_size: %u\n",
5424 uvm_pmm_gpu_chunk_state_string(chunk->state),
5425 block->start,
5426 block->end + 1,
5427 uvm_id_value(id),
5428 page_index,
5429 i,
5430 chunk_size);
5431 return false;
5432 }
5433
5434 UVM_ASSERT(chunk->va_block == block);
5435 UVM_ASSERT(chunk->va_block_page_index == page_index);
5436
5437 UVM_ASSERT(block_check_chunk_indirect_peers(block, gpu, chunk));
5438 }
5439
5440 page_index += chunk_size / PAGE_SIZE;
5441 }
5442
5443 return true;
5444 }
5445
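// Sanity check the chunk arrays of every GPU with state in this block, plus
// the CPU chunks.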
5446 static bool block_check_chunks(uvm_va_block_t *va_block)
5447 {
5448 uvm_gpu_id_t id;
5449
5450 for_each_gpu_id(id) {
5451 if (!block_check_gpu_chunks(va_block, id))
5452 return false;
5453 }
5454
5455 return block_check_cpu_chunks(va_block);
5456 }
5457
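// Scratch processor masks used by block_check_mappings_page, allocated from
// the heap via uvm_kvmalloc rather than on the stack.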
5458 typedef struct
5459 {
5460 uvm_processor_mask_t atomic_mappings;
5461 uvm_processor_mask_t write_mappings;
5462 uvm_processor_mask_t read_mappings;
5463 uvm_processor_mask_t lite_read_mappings;
5464 uvm_processor_mask_t lite_atomic_mappings;
5465 uvm_processor_mask_t remaining_mappings;
5466 uvm_processor_mask_t temp_mappings;
5467 uvm_processor_mask_t resident_processors;
5468 uvm_processor_mask_t native_atomics;
5469 uvm_processor_mask_t non_native_atomics;
5470 uvm_processor_mask_t residency_accessible_from;
5471 uvm_processor_mask_t residency_has_native_atomics;
5472 } mapping_masks_t;
5473
5474 // Sanity checks for page mappings
5475 static bool block_check_mappings_page(uvm_va_block_t *block,
5476 uvm_va_block_context_t *block_context,
5477 uvm_page_index_t page_index)
5478 {
5479 uvm_processor_mask_t *atomic_mappings, *write_mappings, *read_mappings;
5480 uvm_processor_mask_t *lite_read_mappings, *lite_atomic_mappings;
5481 uvm_processor_mask_t *remaining_mappings, *temp_mappings;
5482 uvm_processor_mask_t *resident_processors;
5483 uvm_processor_mask_t *native_atomics, *non_native_atomics;
5484 uvm_processor_mask_t *residency_accessible_from;
5485 uvm_processor_mask_t *residency_has_native_atomics;
5486 uvm_processor_id_t residency, id;
5487 uvm_va_range_t *va_range = block->va_range;
5488 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5489 uvm_processor_id_t preferred_location = va_range ?
5490 uvm_va_range_get_policy(va_range)->preferred_location :
5491 UVM_ID_INVALID;
5492 const uvm_processor_mask_t *uvm_lite_gpus = block_get_uvm_lite_gpus(block);
5493 mapping_masks_t *mapping_masks = uvm_kvmalloc(sizeof(*mapping_masks));
5494
5495 // Since all subsequent checks are skipped if mapping_masks allocation
5496 // fails, assert so that assertion messages can be seen on non-release
5497 // builds.
5498 UVM_ASSERT(mapping_masks);
5499
5500 if (!mapping_masks)
5501 return true;
5502
5503 atomic_mappings = &mapping_masks->atomic_mappings;
5504 write_mappings = &mapping_masks->write_mappings;
5505 read_mappings = &mapping_masks->read_mappings;
5506
5507 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE_ATOMIC, atomic_mappings);
5508 block_page_authorized_processors(block, page_index, UVM_PROT_READ_WRITE, write_mappings);
5509 block_page_authorized_processors(block, page_index, UVM_PROT_READ_ONLY, read_mappings);
5510
5511 // Each access bit implies all accesses below it
5512 UVM_ASSERT(uvm_processor_mask_subset(atomic_mappings, write_mappings));
5513 UVM_ASSERT(uvm_processor_mask_subset(write_mappings, read_mappings));
5514 UVM_ASSERT(uvm_processor_mask_subset(read_mappings, &block->mapped));
5515
5516 resident_processors = &mapping_masks->resident_processors;
5517
5518 uvm_va_block_page_resident_processors(block, page_index, resident_processors);
5519 UVM_ASSERT(uvm_processor_mask_subset(resident_processors, &block->resident));
5520
5521 remaining_mappings = &mapping_masks->remaining_mappings;
5522 temp_mappings = &mapping_masks->temp_mappings;
5523
5524 // Sanity check block_get_mapped_processors
5525 uvm_processor_mask_copy(remaining_mappings, read_mappings);
5526 for_each_id_in_mask(residency, resident_processors) {
5527 block_get_mapped_processors(block, block_context, residency, page_index, temp_mappings);
5528 UVM_ASSERT(uvm_processor_mask_subset(temp_mappings, remaining_mappings));
5529 uvm_processor_mask_andnot(remaining_mappings, remaining_mappings, temp_mappings);
5530 }
5531
5532 // Any remaining mappings point to non-resident locations, so they must be
5533 // UVM-Lite mappings.
5534 UVM_ASSERT(uvm_processor_mask_subset(remaining_mappings, uvm_lite_gpus));
5535
5536 residency = uvm_processor_mask_find_first_id(resident_processors);
5537
5538 residency_accessible_from = &mapping_masks->residency_accessible_from;
5539 residency_has_native_atomics = &mapping_masks->residency_has_native_atomics;
5540
5541 if (uvm_processor_mask_get_count(resident_processors) > 0) {
5542 residency_accessible_from = &va_space->accessible_from[uvm_id_value(residency)];
5543 residency_has_native_atomics = &va_space->has_native_atomics[uvm_id_value(residency)];
5544 }
5545
5546 // If the page is not resident, there should be no valid mappings
5547 UVM_ASSERT_MSG(uvm_processor_mask_get_count(resident_processors) > 0 ||
5548 uvm_processor_mask_get_count(read_mappings) == 0,
5549 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5550 *resident_processors->bitmap,
5551 *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
5552 *va_space->system_wide_atomics_enabled_processors.bitmap,
5553 *block->read_duplicated_pages.bitmap);
5554
5555 // Test read_duplicated_pages mask
5556 UVM_ASSERT_MSG((!uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
5557 uvm_processor_mask_get_count(resident_processors) <= 1) ||
5558 (uvm_page_mask_test(&block->read_duplicated_pages, page_index) &&
5559 uvm_processor_mask_get_count(resident_processors) >= 1),
5560 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5561 *resident_processors->bitmap,
5562 *read_mappings->bitmap,
5563 *write_mappings->bitmap,
5564 *atomic_mappings->bitmap,
5565 *va_space->system_wide_atomics_enabled_processors.bitmap,
5566 *block->read_duplicated_pages.bitmap);
5567
5568 if (!uvm_processor_mask_empty(uvm_lite_gpus))
5569 UVM_ASSERT(UVM_ID_IS_VALID(preferred_location));
5570
5571 lite_read_mappings = &mapping_masks->lite_read_mappings;
5572 lite_atomic_mappings = &mapping_masks->lite_atomic_mappings;
5573
5574 // UVM-Lite checks. Since the range group is made non-migratable before the
5575 // actual migrations for that range group happen, we can only make those
5576 // checks which are valid on both migratable and non-migratable range
5577 // groups.
5578 uvm_processor_mask_and(lite_read_mappings, read_mappings, uvm_lite_gpus);
5579 uvm_processor_mask_and(lite_atomic_mappings, atomic_mappings, uvm_lite_gpus);
5580
5581 // Any mapping from a UVM-Lite GPU must be atomic...
5582 UVM_ASSERT(uvm_processor_mask_equal(lite_read_mappings, lite_atomic_mappings));
5583
5584 // ... and must have access to preferred_location
5585 if (UVM_ID_IS_VALID(preferred_location)) {
5586 const uvm_processor_mask_t *preferred_location_accessible_from;
5587
5588 preferred_location_accessible_from = &va_space->accessible_from[uvm_id_value(preferred_location)];
5589 UVM_ASSERT(uvm_processor_mask_subset(lite_atomic_mappings, preferred_location_accessible_from));
5590 }
5591
5592 for_each_id_in_mask(id, lite_atomic_mappings)
5593 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], preferred_location));
5594
5595 // Exclude uvm_lite_gpus from mappings' masks after UVM-Lite tests
5596 uvm_processor_mask_andnot(read_mappings, read_mappings, uvm_lite_gpus);
5597 uvm_processor_mask_andnot(write_mappings, write_mappings, uvm_lite_gpus);
5598 uvm_processor_mask_andnot(atomic_mappings, atomic_mappings, uvm_lite_gpus);
5599
5600 // Pages set to zero in maybe_mapped_pages must not be mapped on any
5601 // non-UVM-Lite GPU
5602 if (!uvm_va_block_is_hmm(block) && !uvm_page_mask_test(&block->maybe_mapped_pages, page_index)) {
5603 UVM_ASSERT_MSG(uvm_processor_mask_get_count(read_mappings) == 0,
5604 "Resident: 0x%lx - Mappings Block: 0x%lx / Page R: 0x%lx W: 0x%lx A: 0x%lx\n",
5605 *resident_processors->bitmap,
5606 *block->mapped.bitmap,
5607 *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap);
5608 }
5609
5610 // atomic mappings from GPUs with disabled system-wide atomics are treated
5611 // as write mappings. Therefore, we remove them from the atomic mappings mask
5612 uvm_processor_mask_and(atomic_mappings, atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
5613
5614 if (!uvm_processor_mask_empty(read_mappings)) {
5615 // Read-duplicate: if a page is resident in multiple locations, it
5616 // must be resident locally on each mapped processor.
5617 if (uvm_processor_mask_get_count(resident_processors) > 1) {
5618 UVM_ASSERT_MSG(uvm_processor_mask_subset(read_mappings, resident_processors),
5619 "Read-duplicate copies from remote processors\n"
5620 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5621 *resident_processors->bitmap,
5622 *read_mappings->bitmap, *write_mappings->bitmap, *atomic_mappings->bitmap,
5623 *va_space->system_wide_atomics_enabled_processors.bitmap,
5624 *block->read_duplicated_pages.bitmap);
5625 }
5626 else {
5627 // Processors with mappings must have access to the processor that
5628 // has the valid copy
5629 UVM_ASSERT_MSG(uvm_processor_mask_subset(read_mappings, residency_accessible_from),
5630 "Not all processors have access to %s\n"
5631 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5632 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5633 uvm_va_space_processor_name(va_space, residency),
5634 *resident_processors->bitmap,
5635 *read_mappings->bitmap,
5636 *write_mappings->bitmap,
5637 *atomic_mappings->bitmap,
5638 *residency_accessible_from->bitmap,
5639 *residency_has_native_atomics->bitmap,
5640 *va_space->system_wide_atomics_enabled_processors.bitmap);
5641 for_each_id_in_mask(id, read_mappings) {
5642 UVM_ASSERT(uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id)], residency));
5643
5644 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(residency)], id)) {
5645 uvm_gpu_t *resident_gpu = uvm_va_space_get_gpu(va_space, residency);
5646 uvm_gpu_t *mapped_gpu = uvm_va_space_get_gpu(va_space, id);
5647 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block,
5648 block_phys_page(residency, NUMA_NO_NODE, page_index),
5649 NULL);
5650
5651 // This function will assert if no mapping exists
5652 (void)uvm_pmm_gpu_indirect_peer_addr(&resident_gpu->pmm, chunk, mapped_gpu);
5653 }
5654 }
5655 }
5656 }
5657
5658 // If any processor has a writable mapping, there must only be one copy of
5659 // the page in the system
5660 if (!uvm_processor_mask_empty(write_mappings)) {
5661 UVM_ASSERT_MSG(uvm_processor_mask_get_count(resident_processors) == 1,
5662 "Too many resident copies for pages with write_mappings\n"
5663 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx - SWA: 0x%lx - RD: 0x%lx\n",
5664 *resident_processors->bitmap,
5665 *read_mappings->bitmap,
5666 *write_mappings->bitmap,
5667 *atomic_mappings->bitmap,
5668 *va_space->system_wide_atomics_enabled_processors.bitmap,
5669 *block->read_duplicated_pages.bitmap);
5670 }
5671
5672 if (!uvm_processor_mask_empty(atomic_mappings)) {
5673
5674 native_atomics = &mapping_masks->native_atomics;
5675
5676 uvm_processor_mask_and(native_atomics, atomic_mappings, residency_has_native_atomics);
5677
5678 if (uvm_processor_mask_empty(native_atomics)) {
5679 // No other faultable processor should be able to write
5680 uvm_processor_mask_and(write_mappings, write_mappings, &va_space->faultable_processors);
5681
5682 UVM_ASSERT_MSG(uvm_processor_mask_get_count(write_mappings) == 1,
5683 "Too many write mappings to %s from processors with non-native atomics\n"
5684 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5685 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5686 uvm_va_space_processor_name(va_space, residency),
5687 *resident_processors->bitmap,
5688 *read_mappings->bitmap,
5689 *write_mappings->bitmap,
5690 *atomic_mappings->bitmap,
5691 *residency_accessible_from->bitmap,
5692 *residency_has_native_atomics->bitmap,
5693 *va_space->system_wide_atomics_enabled_processors.bitmap);
5694
5695 // Only one processor outside of the native group can have atomics enabled
5696 UVM_ASSERT_MSG(uvm_processor_mask_get_count(atomic_mappings) == 1,
5697 "Too many atomics mappings to %s from processors with non-native atomics\n"
5698 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5699 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5700 uvm_va_space_processor_name(va_space, residency),
5701 *resident_processors->bitmap,
5702 *read_mappings->bitmap,
5703 *write_mappings->bitmap,
5704 *atomic_mappings->bitmap,
5705 *residency_accessible_from->bitmap,
5706 *residency_has_native_atomics->bitmap,
5707 *va_space->system_wide_atomics_enabled_processors.bitmap);
5708 }
5709 else {
5710
5711 non_native_atomics = &mapping_masks->non_native_atomics;
5712
5713 // One or more processors within the native group have atomics enabled.
5714 // All processors outside of that group may have write but not atomic
5715 // permissions.
5716 uvm_processor_mask_andnot(non_native_atomics, atomic_mappings, residency_has_native_atomics);
5717
5718 UVM_ASSERT_MSG(uvm_processor_mask_empty(non_native_atomics),
5719 "atomic mappings to %s from processors native and non-native\n"
5720 "Resident: 0x%lx - Mappings R: 0x%lx W: 0x%lx A: 0x%lx -"
5721 "Access: 0x%lx - Native Atomics: 0x%lx - SWA: 0x%lx\n",
5722 uvm_va_space_processor_name(va_space, residency),
5723 *resident_processors->bitmap,
5724 *read_mappings->bitmap,
5725 *write_mappings->bitmap,
5726 *atomic_mappings->bitmap,
5727 *residency_accessible_from->bitmap,
5728 *residency_has_native_atomics->bitmap,
5729 *va_space->system_wide_atomics_enabled_processors.bitmap);
5730 }
5731 }
5732
5733 uvm_kvfree(mapping_masks);
5734 return true;
5735 }
5736
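// Sanity check the given GPU's 2M/big/4k PTE state: page table range
// allocations, the pte_bits and big_ptes bookkeeping, and the residency of the
// memory currently mapped. Always returns true so it can be wrapped in
// UVM_ASSERT.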
5737 static bool block_check_mappings_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
5738 {
5739 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
5740 uvm_va_block_gpu_state_t *resident_gpu_state;
5741 uvm_pte_bits_gpu_t pte_bit;
5742 uvm_processor_id_t resident_id;
5743 uvm_prot_t prot;
5744 NvU32 big_page_size;
5745 size_t num_big_pages, big_page_index;
5746 uvm_va_block_region_t big_region, chunk_region;
5747 uvm_gpu_chunk_t *chunk;
5748
5749 if (!gpu_state->page_table_range_4k.table)
5750 UVM_ASSERT(!gpu_state->activated_4k);
5751
5752 if (!gpu_state->page_table_range_big.table) {
5753 UVM_ASSERT(!gpu_state->initialized_big);
5754 UVM_ASSERT(!gpu_state->activated_big);
5755 }
5756
5757 // It's only safe to check the PTE mappings if we have page tables. See
5758 // uvm_va_block_get_gpu_va_space.
5759 if (!block_gpu_has_page_tables(block, gpu)) {
5760 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, gpu->id));
5761 return true;
5762 }
5763
5764 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
5765 num_big_pages = uvm_va_block_num_big_pages(block, big_page_size);
5766
5767 if (block_gpu_supports_2m(block, gpu)) {
5768 if (gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table) {
5769 // 2M blocks require the 2M entry to be allocated for the lower
5770 // ranges to also be allocated.
5771 UVM_ASSERT(gpu_state->page_table_range_2m.table);
5772 }
5773 else if (gpu_state->page_table_range_2m.table) {
5774 // If the 2M entry is present but the lower ones aren't, the PTE
5775 // must be 2M.
5776 UVM_ASSERT(gpu_state->pte_is_2m);
5777 }
5778 }
5779 else {
5780 UVM_ASSERT(!gpu_state->page_table_range_2m.table);
5781 if (num_big_pages == 0)
5782 UVM_ASSERT(!gpu_state->page_table_range_big.table);
5783 }
5784
5785 // If we have the big table and it's in use then it must have been
5786 // initialized, even if it doesn't currently contain active PTEs.
5787 if ((!block_gpu_supports_2m(block, gpu) && gpu_state->page_table_range_big.table) ||
5788 (block_gpu_supports_2m(block, gpu) && !gpu_state->pte_is_2m && gpu_state->activated_big))
5789 UVM_ASSERT(gpu_state->initialized_big);
5790
5791 if (gpu_state->pte_is_2m) {
5792 UVM_ASSERT(block_gpu_supports_2m(block, gpu));
5793 UVM_ASSERT(gpu_state->page_table_range_2m.table);
5794 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
5795 UVM_ASSERT(!gpu_state->force_4k_ptes);
5796
5797 // GPU architectures which support 2M pages only support 64K as the big
5798 // page size. All of the 2M code assumes that
5799 // MAX_BIG_PAGES_PER_UVM_VA_BLOCK covers a 2M PTE exactly (bitmap_full,
5800 // bitmap_complement, etc).
5801 BUILD_BUG_ON((UVM_PAGE_SIZE_2M / UVM_PAGE_SIZE_64K) != MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
5802
5803 prot = block_page_prot_gpu(block, gpu, 0);
5804
5805 // All page permissions match
5806 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
5807 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
5808 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[pte_bit]));
5809 else
5810 UVM_ASSERT(uvm_page_mask_full(&gpu_state->pte_bits[pte_bit]));
5811 }
5812
5813 if (prot != UVM_PROT_NONE) {
5814 resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
5815
5816 // block_check_resident_proximity verifies that no closer processor
5817 // has a resident page, so we don't need to check that all pages
5818 // have the same resident_id.
5819
5820 // block_check_mappings_page verifies that all pages marked resident
5821 // are backed by populated memory.
5822
5823 // The mapped processor should be fully resident and physically-
5824 // contiguous.
5825 UVM_ASSERT(uvm_page_mask_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE)));
5826
5827 if (UVM_ID_IS_GPU(resident_id)) {
5828 resident_gpu_state = uvm_va_block_gpu_state_get(block, resident_id);
5829 UVM_ASSERT(resident_gpu_state);
5830 UVM_ASSERT(uvm_gpu_chunk_get_size(resident_gpu_state->chunks[0]) == UVM_CHUNK_SIZE_2M);
5831 }
5832 else {
5833 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, 0);
5834 int chunk_nid = uvm_cpu_chunk_get_numa_node(chunk);
5835
5836 UVM_ASSERT(uvm_page_mask_full(&block->cpu.allocated));
5837 UVM_ASSERT(chunk);
5838 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
5839 UVM_ASSERT(uvm_va_block_cpu_is_region_resident_on(block,
5840 chunk_nid,
5841 uvm_va_block_region_from_block(block)));
5842 }
5843 }
5844 }
5845 else if (!bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
5846 UVM_ASSERT(gpu_state->page_table_range_big.table);
5847 UVM_ASSERT(!gpu_state->force_4k_ptes);
5848 UVM_ASSERT(num_big_pages > 0);
5849 UVM_ASSERT(gpu_state->initialized_big);
5850
5851 for (big_page_index = 0; big_page_index < num_big_pages; big_page_index++) {
5852 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
5853
5854 if (!test_bit(big_page_index, gpu_state->big_ptes)) {
5855 // If there are valid mappings but this isn't a big PTE, the
5856 // mapping must be using the 4k PTEs.
5857 if (!uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region))
5858 UVM_ASSERT(gpu_state->page_table_range_4k.table);
5859 continue;
5860 }
5861
5862 prot = block_page_prot_gpu(block, gpu, big_region.first);
5863
5864 // All page permissions match
5865 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
5866 if (prot == UVM_PROT_NONE || pte_bit > get_gpu_pte_bit_index(prot))
5867 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[pte_bit], big_region));
5868 else
5869 UVM_ASSERT(uvm_page_mask_region_full(&gpu_state->pte_bits[pte_bit], big_region));
5870 }
5871
5872 if (prot != UVM_PROT_NONE) {
5873 resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
5874
5875 // The mapped processor should be fully resident and physically-
5876 // contiguous. Exception: UVM-Lite GPUs always map the preferred
5877 // location even if the memory is resident elsewhere. Skip the
5878 // residency check but still verify contiguity.
5879 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
5880 UVM_ASSERT(
5881 uvm_page_mask_region_full(uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE),
5882 big_region));
5883 }
5884
5885 if (UVM_ID_IS_CPU(resident_id)) {
5886 int resident_nid = block_get_page_node_residency(block, big_region.first);
5887 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, resident_nid);
5888 uvm_cpu_chunk_t *chunk;
5889
5890 UVM_ASSERT(resident_nid != NUMA_NO_NODE);
5891 UVM_ASSERT(uvm_page_mask_region_full(&node_state->allocated, big_region));
5892 chunk = uvm_cpu_chunk_get_chunk_for_page(block, resident_nid, big_region.first);
5893 UVM_ASSERT(gpu->parent->can_map_sysmem_with_large_pages);
5894 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) >= uvm_va_block_region_size(big_region));
5895 UVM_ASSERT(uvm_page_mask_region_full(&node_state->resident, big_region));
5896 }
5897 else {
5898 // Check GPU chunks
5899 chunk = block_phys_page_chunk(block,
5900 block_phys_page(resident_id, NUMA_NO_NODE, big_region.first),
5901 NULL);
5902 chunk_region = uvm_va_block_chunk_region(block, uvm_gpu_chunk_get_size(chunk), big_region.first);
5903 UVM_ASSERT(uvm_va_block_region_contains_region(chunk_region, big_region));
5904 }
5905 }
5906 }
5907 }
5908
5909 return true;
5910 }
5911
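// Sanity check the block's master resident/mapped/evicted masks, the mappings
// of every page, and the per-GPU PTE state. Always returns true so it can be
// wrapped in UVM_ASSERT.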
5912 static bool block_check_mappings(uvm_va_block_t *block, uvm_va_block_context_t *block_context)
5913 {
5914 uvm_page_index_t page_index;
5915 uvm_processor_id_t id;
5916
5917 // Verify the master masks, since block_check_mappings_page relies on them
5918 for_each_id(id) {
5919 const uvm_page_mask_t *resident_mask, *map_mask;
5920
5921 if (UVM_ID_IS_GPU(id) && !uvm_va_block_gpu_state_get(block, id)) {
5922 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
5923 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
5924 UVM_ASSERT(!uvm_processor_mask_test(&block->evicted_gpus, id));
5925 continue;
5926 }
5927
5928 resident_mask = uvm_va_block_resident_mask_get(block, id, NUMA_NO_NODE);
5929 UVM_ASSERT(uvm_processor_mask_test(&block->resident, id) == !uvm_page_mask_empty(resident_mask));
5930
5931 map_mask = uvm_va_block_map_mask_get(block, id);
5932 UVM_ASSERT(uvm_processor_mask_test(&block->mapped, id) == !uvm_page_mask_empty(map_mask));
5933
5934 if (UVM_ID_IS_GPU(id)) {
5935 const uvm_page_mask_t *evicted_mask = block_evicted_mask_get(block, id);
5936 UVM_ASSERT(uvm_processor_mask_test(&block->evicted_gpus, id) == !uvm_page_mask_empty(evicted_mask));
5937
5938 // Pages cannot be resident if they are marked as evicted
5939 UVM_ASSERT(!uvm_page_mask_intersects(evicted_mask, resident_mask));
5940
5941 // Pages cannot be resident on a GPU with no memory
5942 if (!block_processor_has_memory(block, id))
5943 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
5944 }
5945 }
5946
5947 // Check that every page has coherent mappings
5948 for_each_va_block_page(page_index, block)
5949 block_check_mappings_page(block, block_context, page_index);
5950
5951 for_each_gpu_id(id) {
5952 if (uvm_va_block_gpu_state_get(block, id)) {
5953 uvm_gpu_t *gpu = block_get_gpu(block, id);
5954
5955 // Check big and/or 2M PTE state
5956 block_check_mappings_ptes(block, block_context, gpu);
5957 }
5958 }
5959
5960 return true;
5961 }
5962
5963 // See the comments on uvm_va_block_unmap
5964 static void block_unmap_cpu(uvm_va_block_t *block,
5965 uvm_va_block_context_t *block_context,
5966 uvm_va_block_region_t region,
5967 const uvm_page_mask_t *unmap_pages)
5968 {
5969 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
5970 uvm_pte_bits_cpu_t pte_bit;
5971 bool unmapped_something = false;
5972 uvm_va_block_region_t subregion;
5973 NvU32 num_mapped_processors;
5974
5975 // Early-out if nothing in the region is mapped or being unmapped.
5976 if (!block_has_valid_mapping_cpu(block, region) ||
5977 (unmap_pages && !uvm_page_mask_intersects(unmap_pages, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])))
5978 return;
5979
5980 // We can't actually unmap HMM ranges from the CPU here.
5981 // Unmapping happens as part of migrate_vma_setup().
5982 if (uvm_va_block_is_hmm(block)) {
5983 UVM_ASSERT(!uvm_va_block_is_hmm(block));
5984 return;
5985 }
5986
5987 num_mapped_processors = uvm_processor_mask_get_count(&block->mapped);
5988
5989 // If we are unmapping a page which we are tracking due to CPU faults with
5990 // correct permissions, clear the info. This will cover both the unmap and
5991 // revoke cases (since we implement CPU revocation by unmap + map)
5992 if (block->cpu.fault_authorized.first_fault_stamp &&
5993 uvm_page_mask_region_test(unmap_pages, region, block->cpu.fault_authorized.page_index))
5994 block->cpu.fault_authorized.first_fault_stamp = 0;
5995
5996 for_each_va_block_subregion_in_mask(subregion, unmap_pages, region) {
5997 if (!block_has_valid_mapping_cpu(block, subregion))
5998 continue;
5999
6000 unmap_mapping_range(va_space->mapping,
6001 uvm_va_block_region_start(block, subregion),
6002 uvm_va_block_region_size(subregion), 1);
6003
6004 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
6005 uvm_page_mask_region_clear(&block->cpu.pte_bits[pte_bit], subregion);
6006
6007 // If the CPU is the only processor with mappings we can safely mark
6008 // the pages as fully unmapped
6009 if (num_mapped_processors == 1 && !uvm_va_block_is_hmm(block))
6010 uvm_page_mask_region_clear(&block->maybe_mapped_pages, subregion);
6011
6012 unmapped_something = true;
6013 }
6014
6015 if (!unmapped_something)
6016 return;
6017
6018 // Check whether the block has any more mappings
6019 if (uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ])) {
6020 UVM_ASSERT(uvm_page_mask_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
6021 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
6022 }
6023
6024 UVM_ASSERT(block_check_mappings(block, block_context));
6025 }
6026
6027 // Given a mask of mapped pages, returns true if any of the pages in the mask
6028 // are mapped remotely by the given GPU.
6029 static bool block_has_remote_mapping_gpu(uvm_va_block_t *block,
6030 uvm_page_mask_t *scratch_page_mask,
6031 uvm_gpu_id_t gpu_id,
6032 const uvm_page_mask_t *mapped_pages)
6033 {
6034 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu_id);
6035
6036 if (!gpu_state)
6037 return false;
6038
6039 // The caller must ensure that all pages of the input mask are really mapped
6040 UVM_ASSERT(uvm_page_mask_subset(mapped_pages, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
6041
6042 // UVM-Lite GPUs map the preferred location if it's accessible, regardless
6043 // of the resident location.
6044 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu_id)) {
6045 if (uvm_page_mask_empty(mapped_pages))
6046 return false;
6047
6048 return !uvm_va_policy_preferred_location_equal(uvm_va_range_get_policy(block->va_range), gpu_id, NUMA_NO_NODE);
6049 }
6050
6051 // Remote pages are pages which are mapped but not resident locally
6052 return uvm_page_mask_andnot(scratch_page_mask, mapped_pages, &gpu_state->resident);
6053 }
6054
6055 // Writes pte_clear_val to the 4k PTEs covered by clear_page_mask. If
6056 // clear_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
6057 //
6058 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
6059 // caller is responsible for ending the TLB batch with the appropriate membar.
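//
// A minimal sketch of how a caller might bracket these PTE helpers
// (illustrative only; the mask and batch names below are placeholders, and the
// batch objects typically live in the block context's mapping state):
//
//     uvm_pte_batch_begin(push, pte_batch);
//     uvm_tlb_batch_begin(tree, tlb_batch);
//     block_gpu_pte_clear_4k(block, gpu, clear_mask, 0, pte_batch, tlb_batch);
//     uvm_pte_batch_end(pte_batch);
//     uvm_tlb_batch_end(tlb_batch, push, tlb_membar); // membar per the operation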
6060 static void block_gpu_pte_clear_4k(uvm_va_block_t *block,
6061 uvm_gpu_t *gpu,
6062 const uvm_page_mask_t *clear_page_mask,
6063 NvU64 pte_clear_val,
6064 uvm_pte_batch_t *pte_batch,
6065 uvm_tlb_batch_t *tlb_batch)
6066 {
6067 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6068 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6069 uvm_gpu_phys_address_t pte_addr;
6070 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
6071 uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
6072 uvm_va_block_region_t subregion;
6073 size_t num_ptes, ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
6074
6075 for_each_va_block_subregion_in_mask(subregion, clear_page_mask, region) {
6076 num_ptes = uvm_va_block_region_num_pages(subregion) * ptes_per_page;
6077
6078 pte_addr = uvm_page_table_range_entry_address(tree,
6079 &gpu_state->page_table_range_4k,
6080 subregion.first * ptes_per_page);
6081
6082 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, num_ptes);
6083
6084 if (tlb_batch) {
6085 uvm_tlb_batch_invalidate(tlb_batch,
6086 uvm_va_block_region_start(block, subregion),
6087 uvm_va_block_region_size(subregion),
6088 UVM_PAGE_SIZE_4K,
6089 UVM_MEMBAR_NONE);
6090 }
6091 }
6092 }
6093
6094 // Writes the 4k PTEs covered by write_page_mask using memory from resident_id
6095 // with new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6096 // block_gpu_pte_clear_4k instead.
6097 //
6098 // If write_page_mask is NULL, all 4k PTEs in the {block, gpu} are written.
6099 //
6100 // If tlb_batch is provided, the 4k PTEs written are added to the batch. The
6101 // caller is responsible for ending the TLB batch with the appropriate membar.
6102 static void block_gpu_pte_write_4k(uvm_va_block_t *block,
6103 uvm_gpu_t *gpu,
6104 uvm_processor_id_t resident_id,
6105 uvm_prot_t new_prot,
6106 const uvm_page_mask_t *write_page_mask,
6107 uvm_pte_batch_t *pte_batch,
6108 uvm_tlb_batch_t *tlb_batch)
6109 {
6110 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6111 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6112 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_4K);
6113 const size_t ptes_per_page = PAGE_SIZE / UVM_PAGE_SIZE_4K;
6114 uvm_va_block_region_t contig_region = {0};
6115 uvm_gpu_phys_address_t contig_addr = {0};
6116 uvm_gpu_phys_address_t page_addr = {0};
6117 uvm_page_index_t page_index;
6118 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6119 int contig_nid = NUMA_NO_NODE;
6120
6121 UVM_ASSERT(new_prot != UVM_PROT_NONE);
6122 UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6123
6124 for_each_va_block_page_in_mask(page_index, write_page_mask, block) {
6125 uvm_gpu_phys_address_t pte_addr;
6126 size_t i;
6127 int nid = NUMA_NO_NODE;
6128
6129 if (UVM_ID_IS_CPU(resident_id)) {
6130 nid = block_get_page_node_residency(block, page_index);
6131 UVM_ASSERT(nid != NUMA_NO_NODE);
6132
6133 // Assume that this mapping will be used to write to the page
6134 if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block))
6135 block_mark_cpu_page_dirty(block, page_index, nid);
6136 }
6137
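        // Refresh the cached physically-contiguous region and its base address
        // when this page falls outside the previous region or is resident on a
        // different NUMA node; later pages in the same region reuse the lookup.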
6138 if (page_index >= contig_region.outer || nid != contig_nid) {
6139 contig_region = block_phys_contig_region(block, page_index, resident_id, nid);
6140 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
6141 page_addr = contig_addr;
6142 contig_nid = nid;
6143 }
6144
6145 page_addr.address = contig_addr.address + (page_index - contig_region.first) * PAGE_SIZE;
6146
6147 pte_addr = uvm_page_table_range_entry_address(tree,
6148 &gpu_state->page_table_range_4k,
6149 page_index * ptes_per_page);
6150
6151 // Handle PAGE_SIZE > GPU PTE size
6152 for (i = 0; i < ptes_per_page; i++) {
6153 NvU64 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6154 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6155 page_addr.address += UVM_PAGE_SIZE_4K;
6156 pte_addr.address += pte_size;
6157 }
6158
6159 if (tlb_batch) {
6160 NvU64 page_virt_addr = uvm_va_block_cpu_page_address(block, page_index);
6161 uvm_tlb_batch_invalidate(tlb_batch, page_virt_addr, PAGE_SIZE, UVM_PAGE_SIZE_4K, UVM_MEMBAR_NONE);
6162 }
6163 }
6164 }
6165
6166 // Writes all 4k PTEs under the big PTE regions described by big_ptes_covered.
6167 // This is used to initialize the 4k PTEs when splitting 2M and big PTEs. It
6168 // only writes 4k PTEs, not big PTEs.
6169 //
6170 // For those 4k PTEs, new_pages_mask indicates which ones should inherit the
6171 // mapping from the corresponding big page (0) and which ones should be written
6172 // using memory from resident_id and new_prot (1). Unlike the other pte_write
6173 // functions, new_prot may be UVM_PROT_NONE.
6174 //
6175 // If resident_id is UVM_ID_INVALID, this function looks up the resident ID
6176 // which should inherit the current permissions. new_prot must be UVM_PROT_NONE
6177 // in this case.
6178 //
6179 // new_pages_mask must not be NULL.
6180 //
6181 // No TLB invalidates are required since we've set up the lower PTEs to never be
6182 // cached by the GPU's MMU when covered by larger PTEs.
6183 static void block_gpu_pte_big_split_write_4k(uvm_va_block_t *block,
6184 uvm_va_block_context_t *block_context,
6185 uvm_gpu_t *gpu,
6186 uvm_processor_id_t resident_id,
6187 uvm_prot_t new_prot,
6188 const unsigned long *big_ptes_covered,
6189 const uvm_page_mask_t *new_pages_mask,
6190 uvm_pte_batch_t *pte_batch)
6191 {
6192 uvm_va_block_region_t big_region;
6193 size_t big_page_index;
6194 uvm_processor_id_t curr_resident_id;
6195 uvm_prot_t curr_prot;
6196 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
6197
6198 if (UVM_ID_IS_INVALID(resident_id))
6199 UVM_ASSERT(new_prot == UVM_PROT_NONE);
6200
6201 for_each_set_bit(big_page_index, big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6202 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6203
6204 curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
6205
6206 // The unmap path doesn't know the current residency ahead of time, so
6207 // we have to look it up.
6208 if (UVM_ID_IS_INVALID(resident_id)) {
6209 curr_resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
6210 }
6211 else {
6212 // Check that we aren't changing the aperture of the existing
6213 // mappings. It could be legal in some cases (switching from {RO, A}
6214 // to {RO, B} for example) but we'd need to issue TLB membars.
6215 if (curr_prot != UVM_PROT_NONE) {
6216 UVM_ASSERT(uvm_id_equal(block_gpu_get_processor_to_map(block,
6217 block_context,
6218 gpu,
6219 big_region.first),
6220 resident_id));
6221 }
6222
6223 curr_resident_id = resident_id;
6224 }
6225
6226 // pages in new_pages_mask under this big page get new_prot
6227 uvm_page_mask_zero(&block_context->scratch_page_mask);
6228 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
6229 if (uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
6230 if (new_prot == UVM_PROT_NONE) {
6231 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
6232 }
6233 else {
6234 block_gpu_pte_write_4k(block,
6235 gpu,
6236 curr_resident_id,
6237 new_prot,
6238 &block_context->scratch_page_mask,
6239 pte_batch,
6240 NULL);
6241 }
6242 }
6243
6244 // All other pages under this big page inherit curr_prot
6245 uvm_page_mask_zero(&block_context->scratch_page_mask);
6246 uvm_page_mask_region_fill(&block_context->scratch_page_mask, big_region);
6247 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, &block_context->scratch_page_mask, new_pages_mask)) {
6248 if (curr_prot == UVM_PROT_NONE) {
6249 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
6250 }
6251 else {
6252 block_gpu_pte_write_4k(block,
6253 gpu,
6254 curr_resident_id,
6255 curr_prot,
6256 &block_context->scratch_page_mask,
6257 pte_batch,
6258 NULL);
6259 }
6260 }
6261 }
6262 }
6263
6264 // Writes pte_clear_val to the big PTEs in big_ptes_mask. If big_ptes_mask is
6265 // NULL, all big PTEs in the {block, gpu} are cleared.
6266 //
6267 // If tlb_batch is provided, the big PTEs written are added to the batch. The
6268 // caller is responsible for ending the TLB batch with the appropriate membar.
6269 static void block_gpu_pte_clear_big(uvm_va_block_t *block,
6270 uvm_gpu_t *gpu,
6271 const unsigned long *big_ptes_mask,
6272 NvU64 pte_clear_val,
6273 uvm_pte_batch_t *pte_batch,
6274 uvm_tlb_batch_t *tlb_batch)
6275 {
6276 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6277 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6278 NvU32 big_page_size = gpu_va_space->page_tables.big_page_size;
6279 uvm_gpu_phys_address_t pte_addr;
6280 NvU32 pte_size = uvm_mmu_pte_size(&gpu_va_space->page_tables, big_page_size);
6281 size_t big_page_index;
6282 DECLARE_BITMAP(big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6283
6284 if (big_ptes_mask)
6285 bitmap_copy(big_ptes_to_clear, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6286 else
6287 bitmap_set(big_ptes_to_clear, 0, uvm_va_block_num_big_pages(block, big_page_size));
6288
6289 for_each_set_bit(big_page_index, big_ptes_to_clear, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6290 pte_addr = uvm_page_table_range_entry_address(&gpu_va_space->page_tables,
6291 &gpu_state->page_table_range_big,
6292 big_page_index);
6293 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, pte_clear_val, pte_size, 1);
6294
6295 if (tlb_batch) {
6296 uvm_tlb_batch_invalidate(tlb_batch,
6297 uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
6298 big_page_size,
6299 big_page_size,
6300 UVM_MEMBAR_NONE);
6301 }
6302 }
6303 }
6304
6305 // Writes the big PTEs in big_ptes_mask using memory from resident_id with
6306 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6307 // block_gpu_pte_clear_big instead.
6308 //
6309 // Unlike block_gpu_pte_clear_big, big_ptes_mask must not be NULL.
6310 //
6311 // If tlb_batch is provided, the big PTEs written are added to the batch. The
6312 // caller is responsible for ending the TLB batch with the appropriate membar.
6313 static void block_gpu_pte_write_big(uvm_va_block_t *block,
6314 uvm_gpu_t *gpu,
6315 uvm_processor_id_t resident_id,
6316 uvm_prot_t new_prot,
6317 const unsigned long *big_ptes_mask,
6318 uvm_pte_batch_t *pte_batch,
6319 uvm_tlb_batch_t *tlb_batch)
6320 {
6321 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6322 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6323 uvm_page_tree_t *tree = &gpu_va_space->page_tables;
6324 NvU32 big_page_size = tree->big_page_size;
6325 NvU32 pte_size = uvm_mmu_pte_size(tree, big_page_size);
6326 size_t big_page_index;
6327 uvm_va_block_region_t contig_region = {0};
6328 uvm_gpu_phys_address_t contig_addr = {0};
6329 uvm_gpu_phys_address_t page_addr = {0};
6330 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6331 int contig_nid = NUMA_NO_NODE;
6332
6333 UVM_ASSERT(new_prot != UVM_PROT_NONE);
6334 UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6335 UVM_ASSERT(big_ptes_mask);
6336
6337 if (!bitmap_empty(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
6338 UVM_ASSERT(uvm_va_block_num_big_pages(block, big_page_size) > 0);
6339
6340 if (!gpu->parent->can_map_sysmem_with_large_pages)
6341 UVM_ASSERT(UVM_ID_IS_GPU(resident_id));
6342 }
6343
6344 for_each_set_bit(big_page_index, big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6345 NvU64 pte_val;
6346 uvm_gpu_phys_address_t pte_addr;
6347 uvm_va_block_region_t big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6348 int nid = NUMA_NO_NODE;
6349
6350 if (UVM_ID_IS_CPU(resident_id)) {
6351 nid = block_get_page_node_residency(block, big_region.first);
6352 UVM_ASSERT(nid != NUMA_NO_NODE);
6353
6354 // Assume that this mapping will be used to write to the page
6355 if (new_prot > UVM_PROT_READ_ONLY && !uvm_va_block_is_hmm(block)) {
6356 uvm_page_index_t page_index;
6357
6358 for_each_va_block_page_in_region(page_index, big_region)
6359 block_mark_cpu_page_dirty(block, page_index, nid);
6360 }
6361 }
6362
6363 if (big_region.first >= contig_region.outer || nid != contig_nid) {
6364 contig_region = block_phys_contig_region(block, big_region.first, resident_id, nid);
6365 contig_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, contig_region.first), gpu);
6366 page_addr = contig_addr;
6367 contig_nid = nid;
6368 }
6369
6370 page_addr.address = contig_addr.address + (big_region.first - contig_region.first) * PAGE_SIZE;
6371
6372 pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_big, big_page_index);
6373 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6374 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6375
6376 if (tlb_batch) {
6377 uvm_tlb_batch_invalidate(tlb_batch,
6378 uvm_va_block_region_start(block, big_region),
6379 big_page_size,
6380 big_page_size,
6381 UVM_MEMBAR_NONE);
6382 }
6383 }
6384 }
6385
6386 // Switches any mix of valid or invalid 4k PTEs under the big PTEs in
6387 // big_ptes_to_merge to an unmapped big PTE. This also ends both pte_batch and
6388 // tlb_batch in order to poison the now-unused 4k PTEs.
6389 //
6390 // The 4k PTEs are invalidated with the specified membar.
6391 static void block_gpu_pte_merge_big_and_end(uvm_va_block_t *block,
6392 uvm_va_block_context_t *block_context,
6393 uvm_gpu_t *gpu,
6394 const unsigned long *big_ptes_to_merge,
6395 uvm_push_t *push,
6396 uvm_pte_batch_t *pte_batch,
6397 uvm_tlb_batch_t *tlb_batch,
6398 uvm_membar_t tlb_membar)
6399 {
6400 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6401 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6402 NvU32 big_page_size = tree->big_page_size;
6403 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
6404 size_t big_page_index;
6405 DECLARE_BITMAP(dummy_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6406
6407 UVM_ASSERT(!bitmap_empty(big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6408 UVM_ASSERT(!bitmap_and(dummy_big_ptes, gpu_state->big_ptes, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6409
6410 // We can be called with the 4k PTEs in two cases:
6411 // 1) 4k PTEs allocated. In this case the 4k PTEs are currently active.
6412 //
6413 // 2) 4k PTEs unallocated. In this case the GPU may not have invalid 4k PTEs
6414 // active under the big PTE, depending on whether neighboring blocks
6415 // caused the page tables to be allocated.
6416 //
6417 // In both cases we need to invalidate the 4k PTEs in case the GPU MMU has
6418 // them cached.
6419
6420 // Each big PTE is currently invalid so the 4ks are active (or unallocated).
6421 // First make the big PTEs unmapped to disable future lookups of the 4ks
6422 // under it. We can't directly transition the entry from valid 4k PTEs to
6423 // valid big PTEs, because that could cause the GPU TLBs to cache the same
6424 // VA in different cache lines. That could cause memory ordering to not be
6425 // maintained.
6426 block_gpu_pte_clear_big(block, gpu, big_ptes_to_merge, unmapped_pte_val, pte_batch, tlb_batch);
6427
6428 // Now invalidate the big PTEs we just wrote as well as all 4ks under them.
6429 // Subsequent MMU fills will stop at the now-unmapped big PTEs, so we only
6430 // need to invalidate the 4k PTEs without actually writing them.
6431 for_each_set_bit(big_page_index, big_ptes_to_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6432 uvm_tlb_batch_invalidate(tlb_batch,
6433 uvm_va_block_big_page_addr(block, big_page_index, big_page_size),
6434 big_page_size,
6435 big_page_size | UVM_PAGE_SIZE_4K,
6436 UVM_MEMBAR_NONE);
6437 }
6438
6439 // End the batches for the caller. We need to do this here in order to
6440 // poison the 4ks below.
6441 uvm_pte_batch_end(pte_batch);
6442 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6443
6444 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
6445 // PTEs with a pattern which will trigger fatal faults on access. We have to
6446 // do this after the TLB invalidate of the big PTEs, or the GPU might use
6447 // the new values.
6448 if (UVM_IS_DEBUG() && gpu_state->page_table_range_4k.table) {
6449 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_to_merge);
6450 uvm_pte_batch_begin(push, pte_batch);
6451 block_gpu_pte_clear_4k(block,
6452 gpu,
6453 &block_context->scratch_page_mask,
6454 tree->hal->poisoned_pte(),
6455 pte_batch,
6456 NULL);
6457 uvm_pte_batch_end(pte_batch);
6458 }
6459 }
6460
6461 // Writes 0 (invalid) to the 2M PTE for this {block, gpu}.
6462 //
6463 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
6464 // responsible for ending the TLB batch with the appropriate membar.
6465 static void block_gpu_pte_clear_2m(uvm_va_block_t *block,
6466 uvm_gpu_t *gpu,
6467 uvm_pte_batch_t *pte_batch,
6468 uvm_tlb_batch_t *tlb_batch)
6469 {
6470 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6471 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6472 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
6473 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
6474
6475 // uvm_pte_batch_write_pte only writes the lower 8 bytes of the 16-byte PTE,
6476 // which would cause a problem when trying to make the entry invalid since
6477 // both halves must be 0. Using uvm_pte_batch_clear_ptes writes the entire
6478 // 16 bytes.
6479 uvm_pte_batch_clear_ptes(pte_batch, pte_addr, 0, pte_size, 1);
6480
6481 if (tlb_batch)
6482 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6483 }
6484
6485 // Writes the 2M PTE for {block, gpu} using memory from resident_id with
6486 // new_prot permissions. new_prot must not be UVM_PROT_NONE: use
6487 // block_gpu_pte_clear_2m instead.
6488 //
6489 // If tlb_batch is provided, the 2M PTE is added to the batch. The caller is
6490 // responsible for ending the TLB batch with the appropriate membar.
6491 static void block_gpu_pte_write_2m(uvm_va_block_t *block,
6492 uvm_gpu_t *gpu,
6493 uvm_processor_id_t resident_id,
6494 uvm_prot_t new_prot,
6495 uvm_pte_batch_t *pte_batch,
6496 uvm_tlb_batch_t *tlb_batch)
6497 {
6498 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6499 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6500 uvm_gpu_phys_address_t pte_addr = uvm_page_table_range_entry_address(tree, &gpu_state->page_table_range_2m, 0);
6501 uvm_gpu_phys_address_t page_addr;
6502 NvU32 pte_size = uvm_mmu_pte_size(tree, UVM_PAGE_SIZE_2M);
6503 NvU64 pte_val;
6504 NvU64 pte_flags = block_gpu_pte_flag_cacheable(block, gpu, resident_id);
6505 int nid = NUMA_NO_NODE;
6506
6507 UVM_ASSERT(new_prot != UVM_PROT_NONE);
6508 UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6509
6510 if (UVM_ID_IS_CPU(resident_id)) {
6511 nid = block_get_page_node_residency(block, 0);
6512 UVM_ASSERT(nid != NUMA_NO_NODE);
6513 if (!uvm_va_block_is_hmm(block))
6514 block_mark_cpu_page_dirty(block, 0, nid);
6515 }
6516
6517 page_addr = block_phys_page_address(block, block_phys_page(resident_id, nid, 0), gpu);
6518 pte_val = tree->hal->make_pte(page_addr.aperture, page_addr.address, new_prot, pte_flags);
6519 uvm_pte_batch_write_pte(pte_batch, pte_addr, pte_val, pte_size);
6520
6521 if (tlb_batch)
6522 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6523 }
6524
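// Returns true if the GPU has a big or 4k page table allocated for this block
// which has not yet been activated beneath the 2M entry. Only meaningful for
// blocks which support 2M PTEs on this GPU; otherwise returns false.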
6525 static bool block_gpu_needs_to_activate_table(uvm_va_block_t *block, uvm_gpu_t *gpu)
6526 {
6527 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6528
6529 if (!block_gpu_supports_2m(block, gpu))
6530 return false;
6531
6532 if ((gpu_state->page_table_range_big.table && !gpu_state->activated_big) ||
6533 (gpu_state->page_table_range_4k.table && !gpu_state->activated_4k))
6534 return true;
6535
6536 return false;
6537 }
6538
6539 // Only used if 2M PTEs are supported. Either transitions a 2M PTE to a PDE, or
6540 // activates a newly-allocated page table (big or 4k) while the other is already
6541 // active. The caller must have already written the new PTEs under the table
6542 // with the appropriate membar.
6543 static void block_gpu_write_pde(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_push_t *push, uvm_tlb_batch_t *tlb_batch)
6544 {
6545 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6546 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6547
6548 if (!gpu_state->pte_is_2m)
6549 UVM_ASSERT(block_gpu_needs_to_activate_table(block, gpu));
6550
6551 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
6552
6553 // We always need a membar to order PDE/PTE writes with the TLB invalidate.
6554 // write_pde will do a MEMBAR_SYS by default.
6555 if (uvm_page_table_range_aperture(&gpu_state->page_table_range_2m) == UVM_APERTURE_VID)
6556 uvm_push_set_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
6557 uvm_page_tree_write_pde(tree, &gpu_state->page_table_range_2m, push);
6558
6559 gpu->parent->host_hal->wait_for_idle(push);
6560
6561 // Invalidate just the PDE
6562 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, UVM_PAGE_SIZE_2M, UVM_MEMBAR_NONE);
6563
6564 if (gpu_state->page_table_range_big.table)
6565 gpu_state->activated_big = true;
6566
6567 if (gpu_state->page_table_range_4k.table)
6568 gpu_state->activated_4k = true;
6569 }
6570
6571 // Called to switch the 2M PTE (valid or invalid) to a PDE. The caller should
6572 // have written all lower PTEs as appropriate into the given pte_batch already.
6573 // This function ends the PTE batch, activates the 2M PDE, and does a TLB
6574 // invalidate.
6575 //
6576 // The caller does not need to do any TLB invalidates since none of the lower
6577 // PTEs could be cached.
6578 static void block_gpu_pte_finish_split_2m(uvm_va_block_t *block,
6579 uvm_gpu_t *gpu,
6580 uvm_push_t *push,
6581 uvm_pte_batch_t *pte_batch,
6582 uvm_tlb_batch_t *tlb_batch,
6583 uvm_membar_t tlb_membar)
6584 {
6585 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6586 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6587
6588 // Step 1: Make the 2M entry invalid. We can't directly transition from a
6589 // valid 2M PTE to valid lower PTEs, because that could cause the
6590 // GPU TLBs to cache the same VA in different cache lines. That
6591 // could cause memory ordering to not be maintained.
6592 //
6593 // If the 2M PTE is already invalid, no TLB invalidate is needed.
6594
6595 if (curr_prot == UVM_PROT_NONE) {
6596 // If we aren't downgrading, then we don't need a membar.
6597 UVM_ASSERT(tlb_membar == UVM_MEMBAR_NONE);
6598
6599 // End the batch, which pushes a membar to ensure that the caller's PTE
6600 // writes below 2M are observed before the PDE write we're about to do.
6601 uvm_pte_batch_end(pte_batch);
6602 }
6603 else {
6604 // The 64k and 4k PTEs can't possibly be cached since the 2M entry is
6605 // not yet a PDE, so we just need to invalidate this single 2M entry.
6606 uvm_tlb_batch_begin(tree, tlb_batch);
6607 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
6608
6609 // Make sure the PTE writes are observed before the TLB invalidate
6610 uvm_pte_batch_end(pte_batch);
6611 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6612 }
6613
6614 // Step 2: Switch the 2M entry from invalid to a PDE. This activates the
6615 // smaller PTEs.
6616 uvm_tlb_batch_begin(tree, tlb_batch);
6617 block_gpu_write_pde(block, gpu, push, tlb_batch);
6618 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
6619 }
6620
6621 // Switches any mix of valid or invalid 4k or 64k PTEs to an invalid 2M PTE.
6622 // Any lower PTEs are invalidated with the specified membar.
6623 static void block_gpu_pte_merge_2m(uvm_va_block_t *block,
6624 uvm_va_block_context_t *block_context,
6625 uvm_gpu_t *gpu,
6626 uvm_push_t *push,
6627 uvm_membar_t tlb_membar)
6628 {
6629 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6630 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6631 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6632 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6633 NvU32 tlb_inval_sizes;
6634
6635 UVM_ASSERT(!gpu_state->pte_is_2m);
6636 UVM_ASSERT(gpu_state->page_table_range_big.table || gpu_state->page_table_range_4k.table);
6637
6638 // The 2M entry is currently a PDE, so first make it invalid. We can't
6639 // directly transition the entry from a valid PDE to a valid 2M PTE, because
6640 // that could cause the GPU TLBs to cache the same VA in different cache
6641 // lines. That could cause memory ordering to not be maintained.
6642 uvm_pte_batch_begin(push, pte_batch);
6643 block_gpu_pte_clear_2m(block, gpu, pte_batch, NULL);
6644 uvm_pte_batch_end(pte_batch);
6645
6646 // Now invalidate both the 2M entry we just wrote as well as all lower-level
6647 // entries which could be cached. Subsequent MMU fills will stop at the now-
6648 // invalid 2M entry, so we only need to invalidate the lower PTEs without
6649 // actually writing them.
6650 tlb_inval_sizes = UVM_PAGE_SIZE_2M;
6651 if (gpu_state->page_table_range_big.table)
6652 tlb_inval_sizes |= UVM_PAGE_SIZE_64K;
6653
6654     // Strictly speaking, we only need to invalidate those 4k ranges which are
6655 // not covered by a big pte. However, any such invalidate will require
6656 // enough 4k invalidates to force the TLB batching to invalidate everything
6657 // anyway, so just do the simpler thing.
6658 if (!bitmap_full(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6659 tlb_inval_sizes |= UVM_PAGE_SIZE_4K;
6660
6661 uvm_tlb_batch_begin(tree, tlb_batch);
6662 uvm_tlb_batch_invalidate(tlb_batch, block->start, UVM_PAGE_SIZE_2M, tlb_inval_sizes, UVM_MEMBAR_NONE);
6663 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6664
6665 // As a guard against bad PTE writes/TLB invalidates, fill the now-unused
6666 // PTEs with a pattern which will trigger fatal faults on access. We have to
6667 // do this after the TLB invalidate of the 2M entry, or the GPU might use
6668 // the new values.
6669 if (UVM_IS_DEBUG()) {
6670 uvm_pte_batch_begin(push, pte_batch);
6671
6672 if (gpu_state->page_table_range_big.table) {
6673 block_gpu_pte_clear_big(block,
6674 gpu,
6675 NULL,
6676 tree->hal->poisoned_pte(),
6677 pte_batch,
6678 NULL);
6679 }
6680
6681 if (gpu_state->page_table_range_4k.table) {
6682 block_gpu_pte_clear_4k(block,
6683 gpu,
6684 NULL,
6685 tree->hal->poisoned_pte(),
6686 pte_batch,
6687 NULL);
6688 }
6689
6690 uvm_pte_batch_end(pte_batch);
6691 }
6692 }
6693
6694 static uvm_membar_t block_pte_op_membar(block_pte_op_t pte_op, uvm_gpu_t *gpu, uvm_processor_id_t resident_id)
6695 {
6696 // Permissions upgrades (MAP) don't need membars
6697 if (pte_op == BLOCK_PTE_OP_MAP)
6698 return UVM_MEMBAR_NONE;
6699
6700 UVM_ASSERT(UVM_ID_IS_VALID(resident_id));
6701 UVM_ASSERT(pte_op == BLOCK_PTE_OP_REVOKE);
6702
6703 return uvm_hal_downgrade_membar_type(gpu, uvm_id_equal(gpu->id, resident_id));
6704 }
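
// For a revoke, the membar chosen by uvm_hal_downgrade_membar_type above
// depends on where the now-inaccessible memory lives: typically a membar
// scoped to this GPU suffices when the revoked PTEs pointed at the GPU's own
// vidmem (resident_id == gpu->id), while PTEs pointing at sysmem or a peer's
// memory need a system-scoped membar.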
6705
6706 // Write the 2M PTE for {block, gpu} to the memory on resident_id with new_prot
6707 // permissions. If the 2M entry is currently a PDE, it is first merged into a
6708 // PTE.
6709 //
6710 // new_prot must not be UVM_PROT_NONE: use block_gpu_unmap_to_2m instead.
6711 //
6712 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
6713 // the TLB membar required.
6714 static void block_gpu_map_to_2m(uvm_va_block_t *block,
6715 uvm_va_block_context_t *block_context,
6716 uvm_gpu_t *gpu,
6717 uvm_processor_id_t resident_id,
6718 uvm_prot_t new_prot,
6719 uvm_push_t *push,
6720 block_pte_op_t pte_op)
6721 {
6722 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6723 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
6724 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6725 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6726 uvm_membar_t tlb_membar;
6727
6728 UVM_ASSERT(new_prot != UVM_PROT_NONE);
6729
6730 // If we have a mix of big and 4k PTEs, we have to first merge them to an
6731 // invalid 2M PTE.
6732 if (!gpu_state->pte_is_2m) {
6733 block_gpu_pte_merge_2m(block, block_context, gpu, push, UVM_MEMBAR_NONE);
6734
6735 gpu_state->pte_is_2m = true;
6736 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6737 }
6738
6739 // Write the new permissions
6740 uvm_pte_batch_begin(push, pte_batch);
6741 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
6742
6743 block_gpu_pte_write_2m(block, gpu, resident_id, new_prot, pte_batch, tlb_batch);
6744
6745 uvm_pte_batch_end(pte_batch);
6746
6747 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
6748 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
6749 }
6750
6751 // Combination split + map operation, called when only part of a 2M PTE mapping
6752 // is being changed. This splits an existing valid or invalid 2M PTE into the
6753 // mix of big and 4k PTEs described by block_context->mapping.new_pte_state.
6754 //
6755 // The PTEs covering the pages in pages_to_write are written to the memory on
6756 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
6757 //
6758 // The PTEs covering the pages not set in pages_to_write inherit the mapping of
6759 // the current 2M PTE. If the current mapping is valid, it must target
6760 // resident_id.
6761 //
6762 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
6763 // the TLB membar required.
6764 static void block_gpu_map_split_2m(uvm_va_block_t *block,
6765 uvm_va_block_context_t *block_context,
6766 uvm_gpu_t *gpu,
6767 uvm_processor_id_t resident_id,
6768 const uvm_page_mask_t *pages_to_write,
6769 uvm_prot_t new_prot,
6770 uvm_push_t *push,
6771 block_pte_op_t pte_op)
6772 {
6773 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6774 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6775 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
6776 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6777 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6778 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6779 uvm_membar_t tlb_membar;
6780 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6781 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6782 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6783
6784 UVM_ASSERT(gpu_state->pte_is_2m);
6785
6786 if (!gpu_state->page_table_range_4k.table)
6787 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6788
6789 uvm_pte_batch_begin(push, pte_batch);
6790
6791 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
6792 // from the lower levels. This means we don't need to issue a TLB invalidate
6793 // when writing those levels.
6794
6795 // Cases to handle:
6796 // 1) Big PTEs which inherit curr_prot
6797 // 2) Big PTEs which get new_prot
6798 // 3) Big PTEs which are split to 4k
6799 // a) 4k PTEs which inherit curr_prot under the split big PTEs
6800 // b) 4k PTEs which get new_prot under the split big PTEs
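    //
    // Worked example: with 64K big pages, a full 2M block has 32 big-page
    // regions. If new_pte_state->big_ptes keeps regions 0-15 and 24-31 big
    // and new_pte_state->big_ptes_covered marks regions 8-23 as touched by
    // this operation, then regions 0-7 and 24-31 are case 1 (inherit),
    // regions 8-15 are case 2 (new_prot), and regions 16-23 are case 3
    // (split to 4k, with their 4k PTEs divided into cases 3a and 3b by
    // pages_to_write).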
6801
6802 // Compute the big PTEs which will need to be split to 4k, if any.
6803 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6804
6805 if (gpu_state->page_table_range_big.table) {
6806 // Case 1: Write the big PTEs which will inherit the 2M permissions, if
6807 // any. These are the big PTEs which are unchanged (uncovered) by the
6808 // operation.
6809 bitmap_andnot(big_ptes_inherit,
6810 new_pte_state->big_ptes,
6811 new_pte_state->big_ptes_covered,
6812 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6813
6814 if (curr_prot == UVM_PROT_NONE) {
6815 block_gpu_pte_clear_big(block,
6816 gpu,
6817 big_ptes_inherit,
6818 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
6819 pte_batch,
6820 NULL);
6821 }
6822 else {
6823 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
6824 }
6825
6826 // Case 2: Write the new big PTEs
6827 bitmap_and(big_ptes_new_prot,
6828 new_pte_state->big_ptes,
6829 new_pte_state->big_ptes_covered,
6830 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6831 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_new_prot, pte_batch, NULL);
6832
6833 // Case 3: Write the big PTEs which cover 4k PTEs
6834 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6835
6836 // We just wrote all possible big PTEs, so mark them as initialized
6837 gpu_state->initialized_big = true;
6838 }
6839 else {
6840 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6841 }
6842
6843 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
6844 block_gpu_pte_big_split_write_4k(block,
6845 block_context,
6846 gpu,
6847 resident_id,
6848 new_prot,
6849 big_ptes_split,
6850 pages_to_write,
6851 pte_batch);
6852
6853 // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
6854 // invalidate for the 2M entry.
6855 tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
6856 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
6857
6858 gpu_state->pte_is_2m = false;
6859 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6860 }
6861
6862 // Split the existing 2M PTE into big and 4k PTEs. No permissions are changed.
6863 //
6864 // new_big_ptes specifies which PTEs should be big. NULL means all PTEs should
6865 // be 4k.
6866 static void block_gpu_split_2m(uvm_va_block_t *block,
6867 uvm_va_block_context_t *block_context,
6868 uvm_gpu_t *gpu,
6869 const unsigned long *new_big_ptes,
6870 uvm_push_t *push)
6871 {
6872 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6873 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6874 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6875 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6876 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
6877 DECLARE_BITMAP(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6878 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6879 NvU64 unmapped_pte_val;
6880 uvm_processor_id_t curr_residency;
6881
6882 UVM_ASSERT(gpu_state->pte_is_2m);
6883
6884 if (new_big_ptes)
6885 bitmap_copy(new_big_ptes_local, new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6886 else
6887 bitmap_zero(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6888
6889 if (!bitmap_empty(new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
6890 UVM_ASSERT(gpu_state->page_table_range_big.table);
6891
6892 // We're splitting from 2M to big only, so we'll be writing all big PTEs
6893 if (gpu_state->page_table_range_big.table)
6894 gpu_state->initialized_big = true;
6895
6896 // Cases to handle:
6897 // 1) Big PTEs which inherit curr_prot
6898 // 2) Big PTEs which are split to 4k
6899 // a) 4k PTEs inherit curr_prot under the split big PTEs
6900
6901 // big_ptes_split will cover the 4k regions
6902 bitmap_complement(big_ptes_split, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6903 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->mapping.big_split_page_mask, big_ptes_split);
6904
6905 uvm_pte_batch_begin(push, pte_batch);
6906
6907 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
6908 // from the lower levels. This means we don't need to issue a TLB invalidate
6909 // when writing those levels.
6910
6911 if (curr_prot == UVM_PROT_NONE) {
6912 unmapped_pte_val = tree->hal->unmapped_pte(tree->big_page_size);
6913
6914 // Case 2a: Clear the 4k PTEs under big_ptes_split
6915 block_gpu_pte_clear_4k(block, gpu, &block_context->mapping.big_split_page_mask, 0, pte_batch, NULL);
6916
6917 // Case 1: Make the remaining big PTEs unmapped
6918 block_gpu_pte_clear_big(block, gpu, new_big_ptes_local, unmapped_pte_val, pte_batch, NULL);
6919 }
6920 else {
6921 curr_residency = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
6922
6923 // Case 2a: Write the new 4k PTEs under big_ptes_split
6924 block_gpu_pte_write_4k(block,
6925 gpu,
6926 curr_residency,
6927 curr_prot,
6928 &block_context->mapping.big_split_page_mask,
6929 pte_batch,
6930 NULL);
6931
6932 // Case 1: Write the new big PTEs
6933 block_gpu_pte_write_big(block, gpu, curr_residency, curr_prot, new_big_ptes_local, pte_batch, NULL);
6934 }
6935
6936 // Case 2: Make big_ptes_split invalid to activate the 4k PTEs
6937 if (gpu_state->page_table_range_big.table)
6938 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
6939
6940 // Activate the 2M PDE. This ends the pte_batch and issues a single TLB
6941 // invalidate for the 2M entry. No membar is necessary since we aren't
6942 // changing permissions.
6943 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, UVM_MEMBAR_NONE);
6944
6945 gpu_state->pte_is_2m = false;
6946 bitmap_copy(gpu_state->big_ptes, new_big_ptes_local, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6947 }
6948
6949 // Split the big PTEs in big_ptes_to_split into 4k PTEs. No permissions are
6950 // changed.
6951 //
6952 // big_ptes_to_split must not be NULL.
6953 static void block_gpu_split_big(uvm_va_block_t *block,
6954 uvm_va_block_context_t *block_context,
6955 uvm_gpu_t *gpu,
6956 const unsigned long *big_ptes_to_split,
6957 uvm_push_t *push)
6958 {
6959 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
6960 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
6961 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
6962 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
6963 NvU32 big_page_size = tree->big_page_size;
6964 uvm_va_block_region_t big_region;
6965 uvm_processor_id_t resident_id;
6966 size_t big_page_index;
6967 uvm_prot_t curr_prot;
6968 DECLARE_BITMAP(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6969
6970 UVM_ASSERT(!gpu_state->pte_is_2m);
6971 UVM_ASSERT(bitmap_subset(big_ptes_to_split, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6972 UVM_ASSERT(!bitmap_empty(big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
6973
6974 uvm_pte_batch_begin(push, pte_batch);
6975 uvm_tlb_batch_begin(tree, tlb_batch);
6976
6977 // Write all 4k PTEs under all big PTEs which are being split. We'll make
6978 // the big PTEs inactive below after flushing these writes. No TLB
6979 // invalidate is needed since the big PTE is active.
6980 bitmap_zero(big_ptes_valid, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
6981 for_each_set_bit(big_page_index, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
6982 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
6983 curr_prot = block_page_prot_gpu(block, gpu, big_region.first);
6984
6985 uvm_page_mask_zero(&block_context->mapping.big_split_page_mask);
6986 uvm_page_mask_region_fill(&block_context->mapping.big_split_page_mask, big_region);
6987 if (curr_prot == UVM_PROT_NONE) {
6988 block_gpu_pte_clear_4k(block, gpu, &block_context->mapping.big_split_page_mask, 0, pte_batch, NULL);
6989 }
6990 else {
6991 __set_bit(big_page_index, big_ptes_valid);
6992
6993 resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, big_region.first);
6994
6995 block_gpu_pte_write_4k(block,
6996 gpu,
6997 resident_id,
6998 curr_prot,
6999 &block_context->mapping.big_split_page_mask,
7000 pte_batch,
7001 NULL);
7002 }
7003 }
7004
7005 // Unmap the big PTEs which are valid and are being split to 4k. We can't
7006 // directly transition from a valid big PTE to valid lower PTEs, because
7007 // that could cause the GPU TLBs to cache the same VA in different cache
7008 // lines. That could cause memory ordering to not be maintained.
7009 block_gpu_pte_clear_big(block, gpu, big_ptes_valid, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
7010
7011 // End the batches. We have to commit the membars and TLB invalidates
7012 // before we finish splitting formerly-big PTEs. No membar is necessary
7013 // since we aren't changing permissions.
7014 uvm_pte_batch_end(pte_batch);
7015 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7016
7017 // Finish the split by switching the big PTEs from unmapped to invalid. This
7018 // causes the GPU MMU to start reading the 4k PTEs instead of stopping at
7019 // the unmapped big PTEs.
7020 uvm_pte_batch_begin(push, pte_batch);
7021 uvm_tlb_batch_begin(tree, tlb_batch);
7022
7023 block_gpu_pte_clear_big(block, gpu, big_ptes_to_split, 0, pte_batch, tlb_batch);
7024
7025 uvm_pte_batch_end(pte_batch);
7026
7027 // Finally, activate the page tables if they're inactive
7028 if (block_gpu_needs_to_activate_table(block, gpu))
7029 block_gpu_write_pde(block, gpu, push, tlb_batch);
7030
7031 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7032
7033 bitmap_andnot(gpu_state->big_ptes, gpu_state->big_ptes, big_ptes_to_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7034 }
7035
7036 // Changes permissions on some pre-existing mix of big and 4k PTEs into some
7037 // other mix of big and 4k PTEs, as described by
7038 // block_context->mapping.new_pte_state.
7039 //
7040 // The PTEs covering the pages in pages_to_write are written to the memory on
7041 // resident_id with new_prot permissions. new_prot must not be UVM_PROT_NONE.
7042 //
7043 // pte_op specifies whether this is a MAP or REVOKE operation, which determines
7044 // the TLB membar required.
7045 static void block_gpu_map_big_and_4k(uvm_va_block_t *block,
7046 uvm_va_block_context_t *block_context,
7047 uvm_gpu_t *gpu,
7048 uvm_processor_id_t resident_id,
7049 const uvm_page_mask_t *pages_to_write,
7050 uvm_prot_t new_prot,
7051 uvm_push_t *push,
7052 block_pte_op_t pte_op)
7053 {
7054 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7055 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7056 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7057 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7058 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7059 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7060 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7061 DECLARE_BITMAP(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7062 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7063 uvm_va_block_region_t big_region;
7064 size_t big_page_index;
7065 NvU32 big_page_size = tree->big_page_size;
7066 uvm_membar_t tlb_membar = block_pte_op_membar(pte_op, gpu, resident_id);
7067
7068 UVM_ASSERT(!gpu_state->pte_is_2m);
7069
7070 uvm_pte_batch_begin(push, pte_batch);
7071 uvm_tlb_batch_begin(tree, tlb_batch);
7072
7073     // All of these cases might be performed in the same call:
7074 // 1) Split currently-big PTEs to 4k
7075 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
7076 // b) Write new 4k PTEs which get new_prot under the split big PTEs
7077 // 2) Merge currently-4k PTEs to big with new_prot
7078 // 3) Write currently-big PTEs which wholly get new_prot
7079 // 4) Write currently-4k PTEs which get new_prot
7080 // 5) Initialize big PTEs which are not covered by this operation
7081
7082 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
7083 // being split. We'll make the big PTEs inactive below after flushing these
7084 // writes. No TLB invalidate is needed since the big PTE is active.
7085 //
7086 // Mask computation: big_before && !big_after
7087 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7088
7089 block_gpu_pte_big_split_write_4k(block,
7090 block_context,
7091 gpu,
7092 resident_id,
7093 new_prot,
7094 big_ptes_split,
7095 pages_to_write,
7096 pte_batch);
7097
7098 // Case 4: Write the 4k PTEs which weren't covered by a big PTE before, and
7099 // remain uncovered after the operation.
7100 //
7101 // Mask computation: !big_before && !big_after
7102 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7103 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
7104 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_write, &block_context->scratch_page_mask)) {
7105 block_gpu_pte_write_4k(block,
7106 gpu,
7107 resident_id,
7108 new_prot,
7109 &block_context->scratch_page_mask,
7110 pte_batch,
7111 tlb_batch);
7112 }
7113
7114 // Case 5: If the big page table is newly-allocated, make sure that all big
7115 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
7116 // all initialized to invalid.
7117 //
7118 // The similar case of making newly-allocated big PTEs unmapped when no
7119 // lower 4k table is present is handled by having
7120 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
7121 // appropriately.
7122 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
7123 // TODO: Bug 1766424: If we have the 4k page table already, we could
7124 // attempt to merge all uncovered big PTE regions when first
7125 // allocating the big table. That's probably not worth doing.
7126 UVM_ASSERT(gpu_state->page_table_range_4k.table);
7127 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7128 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
7129 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
7130 gpu_state->initialized_big = true;
7131 }
7132
7133 // Case 1 (step 1): Unmap the currently-big PTEs which are valid and are
7134 // being split to 4k. We can't directly transition from a valid big PTE to
7135 // valid lower PTEs, because that could cause the GPU TLBs to cache the same
7136 // VA in different cache lines. That could cause memory ordering to not be
7137 // maintained.
7138 bitmap_zero(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7139 for_each_set_bit(big_page_index, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) {
7140 big_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
7141 if (uvm_page_mask_test(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], big_region.first))
7142 __set_bit(big_page_index, big_ptes_mask);
7143 }
7144
7145 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, tree->hal->unmapped_pte(big_page_size), pte_batch, tlb_batch);
7146
7147 // Case 3: Write the currently-big PTEs which remain big PTEs, and are
7148 // wholly changing permissions.
7149 //
7150 // Mask computation: big_before && big_after && covered
7151 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7152 if (bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
7153 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_mask, pte_batch, tlb_batch);
7154
7155 // Case 2 (step 1): Merge the new big PTEs and end the batches, now that
7156 // we've done all of the independent PTE writes we can. This also merges
7157 // newly-allocated uncovered big PTEs to unmapped (see
7158 // block_gpu_compute_new_pte_state).
7159 //
7160 // Mask computation: !big_before && big_after
7161 if (bitmap_andnot(big_ptes_merge, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7162 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
7163 // batches.
7164 block_gpu_pte_merge_big_and_end(block,
7165 block_context,
7166 gpu,
7167 big_ptes_merge,
7168 push,
7169 pte_batch,
7170 tlb_batch,
7171 tlb_membar);
7172
7173 // Remove uncovered big PTEs. We needed to merge them to unmapped above,
7174 // but they shouldn't get new_prot below.
7175 bitmap_and(big_ptes_merge, big_ptes_merge, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7176 }
7177 else {
7178 // End the batches. We have to commit the membars and TLB invalidates
7179 // before we finish splitting formerly-big PTEs.
7180 uvm_pte_batch_end(pte_batch);
7181 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7182 }
7183
7184 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7185 !bitmap_empty(big_ptes_merge, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7186 block_gpu_needs_to_activate_table(block, gpu)) {
7187
7188 uvm_pte_batch_begin(push, pte_batch);
7189 uvm_tlb_batch_begin(tree, tlb_batch);
7190
7191 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
7192 // switching them from unmapped to invalid. This causes the GPU MMU to
7193 // start reading the 4k PTEs instead of stopping at the unmapped big
7194 // PTEs.
7195 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
7196
7197 // Case 2 (step 2): Finish merging our big PTEs, if we have any, by
7198 // switching them from unmapped to new_prot.
7199 block_gpu_pte_write_big(block, gpu, resident_id, new_prot, big_ptes_merge, pte_batch, tlb_batch);
7200
7201 uvm_pte_batch_end(pte_batch);
7202
7203 // Finally, activate the page tables if they're inactive
7204 if (block_gpu_needs_to_activate_table(block, gpu))
7205 block_gpu_write_pde(block, gpu, push, tlb_batch);
7206
7207 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7208 }
7209
7210 // Update gpu_state
7211 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7212 }
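
// Illustrative sketch (not called by the driver): the big-PTE mask
// derivations used by block_gpu_map_big_and_4k above, collected in one place.
// Given the current big PTEs (gpu_state->big_ptes), the target big PTEs
// (new_pte_state->big_ptes) and the big regions touched by the operation
// (new_pte_state->big_ptes_covered), it computes which big PTEs are split to
// 4k, merged from 4k, or rewritten in place. The output parameter names are
// hypothetical.
static void __maybe_unused block_gpu_map_big_and_4k_masks_sketch(const unsigned long *big_before,
                                                                 const unsigned long *big_after,
                                                                 const unsigned long *covered,
                                                                 unsigned long *split_to_4k,
                                                                 unsigned long *merge_from_4k,
                                                                 unsigned long *rewrite_big)
{
    // Case 1: big now but not afterwards: split to 4k
    bitmap_andnot(split_to_4k, big_before, big_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);

    // Case 2: not big now but big afterwards: merge 4k PTEs into a big PTE
    bitmap_andnot(merge_from_4k, big_after, big_before, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);

    // Case 3: big before and after, and wholly covered by the operation:
    //         rewrite the big PTE in place with the new permissions
    bitmap_and(rewrite_big, big_before, big_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
    bitmap_and(rewrite_big, rewrite_big, covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
}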
7213
7214 // Unmap all PTEs for {block, gpu}. If the 2M entry is currently a PDE, it is
7215 // merged into a PTE.
7216 static void block_gpu_unmap_to_2m(uvm_va_block_t *block,
7217 uvm_va_block_context_t *block_context,
7218 uvm_gpu_t *gpu,
7219 uvm_push_t *push,
7220 uvm_membar_t tlb_membar)
7221 {
7222 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7223 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(block, gpu);
7224 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7225 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7226
7227 if (gpu_state->pte_is_2m) {
7228 // If we're already mapped as a valid 2M PTE, just write it to invalid
7229 uvm_pte_batch_begin(push, pte_batch);
7230 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
7231
7232 block_gpu_pte_clear_2m(block, gpu, pte_batch, tlb_batch);
7233
7234 uvm_pte_batch_end(pte_batch);
7235 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7236 }
7237 else {
7238 // Otherwise we have a mix of big and 4K PTEs which need to be merged
7239 // into an invalid 2M PTE.
7240 block_gpu_pte_merge_2m(block, block_context, gpu, push, tlb_membar);
7241
7242 gpu_state->pte_is_2m = true;
7243 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7244 }
7245 }
7246
7247 // Combination split + unmap operation, called when only part of a valid 2M PTE
7248 // mapping is being unmapped. The 2M PTE is split into a mix of valid and
7249 // invalid big and/or 4k PTEs, as described by
7250 // block_context->mapping.new_pte_state.
7251 //
7252 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
7253 //
7254 // The PTEs covering the pages not set in pages_to_unmap inherit the mapping of
7255 // the current 2M PTE.
7256 static void block_gpu_unmap_split_2m(uvm_va_block_t *block,
7257 uvm_va_block_context_t *block_context,
7258 uvm_gpu_t *gpu,
7259 const uvm_page_mask_t *pages_to_unmap,
7260 uvm_push_t *push,
7261 uvm_membar_t tlb_membar)
7262 {
7263 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7264 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7265 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7266 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7267 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7268 uvm_prot_t curr_prot = block_page_prot_gpu(block, gpu, 0);
7269 uvm_processor_id_t resident_id;
7270 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7271 DECLARE_BITMAP(big_ptes_inherit, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7272 DECLARE_BITMAP(big_ptes_new_prot, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7273
7274 UVM_ASSERT(gpu_state->pte_is_2m);
7275
7276 resident_id = block_gpu_get_processor_to_map(block, block_context, gpu, 0);
7277
7278 uvm_pte_batch_begin(push, pte_batch);
7279
7280 // Since the 2M entry is active as a PTE, the GPU MMU can't fetch entries
7281 // from the lower levels. This means we don't need to issue a TLB invalidate
7282 // when writing those levels.
7283
7284 // Cases to handle:
7285 // 1) Big PTEs which inherit curr_prot
7286 // 2) Big PTEs which get unmapped
7287 // 3) Big PTEs which are split to 4k
7288 // a) 4k PTEs which inherit curr_prot under the split big PTEs
7289 // b) 4k PTEs which get unmapped under the split big PTEs
7290
7291 // Compute the big PTEs which will need to be split to 4k, if any.
7292 bitmap_complement(big_ptes_split, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7293
7294 if (gpu_state->page_table_range_big.table) {
7295 // Case 1: Write the big PTEs which will inherit the 2M permissions, if
7296 // any. These are the big PTEs which are unchanged (uncovered) by the
7297 // operation.
7298 bitmap_andnot(big_ptes_inherit,
7299 new_pte_state->big_ptes,
7300 new_pte_state->big_ptes_covered,
7301 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7302
7303 block_gpu_pte_write_big(block, gpu, resident_id, curr_prot, big_ptes_inherit, pte_batch, NULL);
7304
7305 // Case 2: Clear the new big PTEs which get unmapped (those not covering
7306 // 4ks)
7307 bitmap_and(big_ptes_new_prot,
7308 new_pte_state->big_ptes,
7309 new_pte_state->big_ptes_covered,
7310 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7311
7312 block_gpu_pte_clear_big(block,
7313 gpu,
7314 big_ptes_new_prot,
7315 tree->hal->unmapped_pte(UVM_PAGE_SIZE_64K),
7316 pte_batch,
7317 NULL);
7318
7319 // Case 3: Write the big PTEs which cover 4k PTEs
7320 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, NULL);
7321
7322 // We just wrote all possible big PTEs, so mark them as initialized
7323 gpu_state->initialized_big = true;
7324 }
7325 else {
7326 UVM_ASSERT(bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7327 UVM_ASSERT(bitmap_full(new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7328 }
7329
7330 // Cases 3a and 3b: Write all 4k PTEs under all now-split big PTEs
7331 block_gpu_pte_big_split_write_4k(block,
7332 block_context,
7333 gpu,
7334 resident_id,
7335 UVM_PROT_NONE,
7336 big_ptes_split,
7337 pages_to_unmap,
7338 pte_batch);
7339
7340 // And activate the 2M PDE. This ends the pte_batch and issues a single TLB
7341 // invalidate for the 2M entry.
7342 block_gpu_pte_finish_split_2m(block, gpu, push, pte_batch, tlb_batch, tlb_membar);
7343
7344 gpu_state->pte_is_2m = false;
7345 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7346 }
7347
7348 // Unmap some pre-existing mix of big and 4k PTEs into some other mix of big
7349 // and 4k PTEs.
7350 //
7351 // The PTEs covering the pages in pages_to_unmap are cleared (unmapped).
7352 static void block_gpu_unmap_big_and_4k(uvm_va_block_t *block,
7353 uvm_va_block_context_t *block_context,
7354 uvm_gpu_t *gpu,
7355 const uvm_page_mask_t *pages_to_unmap,
7356 uvm_push_t *push,
7357 uvm_membar_t tlb_membar)
7358 {
7359 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7360 uvm_page_tree_t *tree = &uvm_va_block_get_gpu_va_space(block, gpu)->page_tables;
7361 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7362 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
7363 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
7364 DECLARE_BITMAP(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7365 DECLARE_BITMAP(big_ptes_before_or_after, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7366 DECLARE_BITMAP(big_ptes_mask, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7367 NvU32 big_page_size = tree->big_page_size;
7368 NvU64 unmapped_pte_val = tree->hal->unmapped_pte(big_page_size);
7369
7370 UVM_ASSERT(!gpu_state->pte_is_2m);
7371
7372 uvm_pte_batch_begin(push, pte_batch);
7373 uvm_tlb_batch_begin(tree, tlb_batch);
7374
7375     // All of these cases might be performed in the same call:
7376 // 1) Split currently-big PTEs to 4k
7377 // a) Write new 4k PTEs which inherit curr_prot under the split big PTEs
7378 // b) Clear new 4k PTEs which get unmapped under the split big PTEs
7379 // 2) Merge currently-4k PTEs to unmapped big
7380 // 3) Clear currently-big PTEs which wholly get unmapped
7381 // 4) Clear currently-4k PTEs which get unmapped
7382 // 5) Initialize big PTEs which are not covered by this operation
7383
7384 // Cases 1a and 1b: Write all 4k PTEs under all currently-big PTEs which are
7385 // being split. We'll make the big PTEs inactive below after flushing these
7386 // writes. No TLB invalidate is needed since the big PTE is active.
7387 //
7388 // Mask computation: big_before && !big_after
7389 bitmap_andnot(big_ptes_split, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7390
7391 block_gpu_pte_big_split_write_4k(block,
7392 block_context,
7393 gpu,
7394 UVM_ID_INVALID,
7395 UVM_PROT_NONE,
7396 big_ptes_split,
7397 pages_to_unmap,
7398 pte_batch);
7399
7400 // Case 4: Clear the 4k PTEs which weren't covered by a big PTE before, and
7401 // remain uncovered after the unmap.
7402 //
7403 // Mask computation: !big_before && !big_after
7404 bitmap_or(big_ptes_before_or_after, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7405 uvm_page_mask_init_from_big_ptes(block, gpu, &block_context->scratch_page_mask, big_ptes_before_or_after);
7406 if (uvm_page_mask_andnot(&block_context->scratch_page_mask, pages_to_unmap, &block_context->scratch_page_mask))
7407 block_gpu_pte_clear_4k(block, gpu, &block_context->scratch_page_mask, 0, pte_batch, tlb_batch);
7408
7409 // Case 5: If the big page table is newly-allocated, make sure that all big
7410 // PTEs we aren't otherwise writing (that is, those which cover 4k PTEs) are
7411 // all initialized to invalid.
7412 //
7413 // The similar case of making newly-allocated big PTEs unmapped when no
7414 // lower 4k table is present is handled by having
7415 // block_gpu_compute_new_pte_state set new_pte_state->big_ptes
7416 // appropriately.
7417 if (gpu_state->page_table_range_big.table && !gpu_state->initialized_big) {
7418 // TODO: Bug 1766424: If we have the 4k page table already, we could
7419 // attempt to merge all uncovered big PTE regions when first
7420 // allocating the big table. That's probably not worth doing.
7421 UVM_ASSERT(gpu_state->page_table_range_4k.table);
7422 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7423 bitmap_complement(big_ptes_mask, new_pte_state->big_ptes, uvm_va_block_num_big_pages(block, big_page_size));
7424 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, 0, pte_batch, tlb_batch);
7425 gpu_state->initialized_big = true;
7426 }
7427
7428 // Case 3 and step 1 of case 1: Unmap both currently-big PTEs which are
7429 // getting wholly unmapped, and those currently-big PTEs which are being
7430 // split to 4k. We can't directly transition from a valid big PTE to valid
7431 // lower PTEs, because that could cause the GPU TLBs to cache the same VA in
7432 // different cache lines. That could cause memory ordering to not be
7433 // maintained.
7434 //
7435 // Mask computation: (big_before && big_after && covered) ||
7436 // (big_before && !big_after)
7437 bitmap_and(big_ptes_mask, gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7438 bitmap_and(big_ptes_mask, big_ptes_mask, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7439 bitmap_or(big_ptes_mask, big_ptes_mask, big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7440 block_gpu_pte_clear_big(block, gpu, big_ptes_mask, unmapped_pte_val, pte_batch, tlb_batch);
7441
7442 // Case 2: Merge the new big PTEs and end the batches, now that we've done
7443 // all of the independent PTE writes we can.
7444 //
7445 // Mask computation: !big_before && big_after
7446 if (bitmap_andnot(big_ptes_mask, new_pte_state->big_ptes, gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
7447 // This writes the newly-big PTEs to unmapped and ends the PTE and TLB
7448 // batches.
7449 block_gpu_pte_merge_big_and_end(block,
7450 block_context,
7451 gpu,
7452 big_ptes_mask,
7453 push,
7454 pte_batch,
7455 tlb_batch,
7456 tlb_membar);
7457 }
7458 else {
7459 // End the batches. We have to commit the membars and TLB invalidates
7460 // before we finish splitting formerly-big PTEs.
7461 uvm_pte_batch_end(pte_batch);
7462 uvm_tlb_batch_end(tlb_batch, push, tlb_membar);
7463 }
7464
7465 if (!bitmap_empty(big_ptes_split, MAX_BIG_PAGES_PER_UVM_VA_BLOCK) ||
7466 block_gpu_needs_to_activate_table(block, gpu)) {
7467 uvm_pte_batch_begin(push, pte_batch);
7468 uvm_tlb_batch_begin(tree, tlb_batch);
7469
7470 // Case 1 (step 2): Finish splitting our big PTEs, if we have any, by
7471 // switching them from unmapped to invalid. This causes the GPU MMU to
7472 // start reading the 4k PTEs instead of stopping at the unmapped big
7473 // PTEs.
7474 block_gpu_pte_clear_big(block, gpu, big_ptes_split, 0, pte_batch, tlb_batch);
7475
7476 uvm_pte_batch_end(pte_batch);
7477
7478 // Finally, activate the page tables if they're inactive
7479 if (block_gpu_needs_to_activate_table(block, gpu))
7480 block_gpu_write_pde(block, gpu, push, tlb_batch);
7481
7482 uvm_tlb_batch_end(tlb_batch, push, UVM_MEMBAR_NONE);
7483 }
7484
7485 // Update gpu_state
7486 bitmap_copy(gpu_state->big_ptes, new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7487 }
7488
7489 // When PTE state is about to change (for example due to a map/unmap/revoke
7490 // operation), this function decides how to split and merge the PTEs in response
7491 // to that operation.
7492 //
7493 // The operation is described with the two page masks:
7494 //
7495 // - pages_changing indicates which pages will have their PTE mappings changed
7496 // on the GPU in some way as a result of the operation (for example, which
7497 // pages will actually have their mapping permissions upgraded).
7498 //
7499 // - page_mask_after indicates which pages on this GPU will have exactly the
7500 // same PTE attributes (permissions, residency) as pages_changing after the
7501 // operation is applied.
7502 //
7503 // PTEs are merged eagerly.
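//
// For example, for an unmap, pages_changing is the set of currently-mapped
// pages being unmapped, while page_mask_after is every page which will be
// unmapped once the operation completes, including pages which were already
// unmapped (see the mask setup in block_unmap_gpu below).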
7504 static void block_gpu_compute_new_pte_state(uvm_va_block_t *block,
7505 uvm_gpu_t *gpu,
7506 uvm_processor_id_t resident_id,
7507 const uvm_page_mask_t *pages_changing,
7508 const uvm_page_mask_t *page_mask_after,
7509 uvm_va_block_new_pte_state_t *new_pte_state)
7510 {
7511 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7512 uvm_va_block_region_t big_region_all, big_page_region, region;
7513 NvU32 big_page_size;
7514 uvm_page_index_t page_index;
7515 size_t big_page_index;
7516 DECLARE_BITMAP(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7517 bool can_make_new_big_ptes;
7518
7519 memset(new_pte_state, 0, sizeof(*new_pte_state));
7520 new_pte_state->needs_4k = true;
7521
7522 // TODO: Bug 1676485: Force a specific page size for perf testing
7523
7524 if (gpu_state->force_4k_ptes)
7525 return;
7526
7527 // Limit HMM GPU allocations to PAGE_SIZE since migrate_vma_*(),
7528 // hmm_range_fault(), and make_device_exclusive_range() don't handle folios
7529 // yet. Also, it makes mremap() difficult since the new address may not
7530 // align with the GPU block size otherwise.
7531 // If PAGE_SIZE is 64K, the code following this check is OK since 64K
7532 // big_pages is supported on all HMM supported GPUs (Turing+).
7533 // TODO: Bug 3368756: add support for transparent huge pages (THP).
7534 if (uvm_va_block_is_hmm(block) && PAGE_SIZE == UVM_PAGE_SIZE_4K)
7535 return;
7536
7537 UVM_ASSERT(uvm_page_mask_subset(pages_changing, page_mask_after));
7538
7539 // If all pages in the 2M mask have the same attributes after the
7540 // operation is applied, we can use a 2M PTE.
7541 if (block_gpu_supports_2m(block, gpu) && uvm_page_mask_full(page_mask_after) &&
7542 (UVM_ID_IS_INVALID(resident_id) ||
7543 is_block_phys_contig(block, resident_id, block_get_page_node_residency(block, 0)))) {
7544 new_pte_state->pte_is_2m = true;
7545 new_pte_state->needs_4k = false;
7546 return;
7547 }
7548
7549 // Find big PTEs with matching attributes
7550
7551 // Can this block fit any big pages?
7552 big_page_size = uvm_va_block_gpu_big_page_size(block, gpu);
7553 big_region_all = uvm_va_block_big_page_region_all(block, big_page_size);
7554 if (big_region_all.first >= big_region_all.outer)
7555 return;
7556
7557 new_pte_state->needs_4k = false;
7558
7559 can_make_new_big_ptes = true;
7560
7561 // Big pages can be used when mapping sysmem if the GPU supports it (Pascal+).
7562 if (UVM_ID_IS_CPU(resident_id) && !gpu->parent->can_map_sysmem_with_large_pages)
7563 can_make_new_big_ptes = false;
7564
7565 // We must not fail during teardown: unmap (resident_id == UVM_ID_INVALID)
7566 // with no splits required. That means we should avoid allocating PTEs
7567 // which are only needed for merges.
7568 //
7569 // This only matters if we're merging to big PTEs. If we're merging to 2M,
7570 // then we must already have the 2M level (since it has to be allocated
7571 // before the lower levels).
7572 //
7573 // If pte_is_2m already and we don't have a big table, we're splitting so we
7574 // have to allocate.
7575 if (UVM_ID_IS_INVALID(resident_id) && !gpu_state->page_table_range_big.table && !gpu_state->pte_is_2m)
7576 can_make_new_big_ptes = false;
7577
7578 for_each_va_block_page_in_region_mask(page_index, pages_changing, big_region_all) {
7579 uvm_cpu_chunk_t *chunk = NULL;
7580 int nid = NUMA_NO_NODE;
7581
7582 if (UVM_ID_IS_CPU(resident_id)) {
7583 nid = block_get_page_node_residency(block, page_index);
7584 UVM_ASSERT(nid != NUMA_NO_NODE);
7585 chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
7586 }
7587
7588 big_page_index = uvm_va_block_big_page_index(block, page_index, big_page_size);
7589 big_page_region = uvm_va_block_big_page_region(block, big_page_index, big_page_size);
7590
7591 __set_bit(big_page_index, new_pte_state->big_ptes_covered);
7592
7593 // When mapping sysmem, we can use big pages only if we are mapping all
7594 // pages in the big page subregion and the CPU pages backing the
7595 // subregion are physically contiguous.
7596 if (can_make_new_big_ptes &&
7597 uvm_page_mask_region_full(page_mask_after, big_page_region) &&
7598 (!UVM_ID_IS_CPU(resident_id) ||
7599 (uvm_cpu_chunk_get_size(chunk) >= big_page_size &&
7600 uvm_va_block_cpu_is_region_resident_on(block, nid, big_page_region))))
7601 __set_bit(big_page_index, new_pte_state->big_ptes);
7602
7603 if (!test_bit(big_page_index, new_pte_state->big_ptes))
7604 new_pte_state->needs_4k = true;
7605
7606 // Skip to the end of the region
7607 page_index = big_page_region.outer - 1;
7608 }
7609
7610 if (!new_pte_state->needs_4k) {
7611 // All big page regions in pages_changing will be big PTEs. Now check if
7612 // there are any unaligned pages outside of big_region_all which are
7613 // changing.
7614 region = uvm_va_block_region(0, big_region_all.first);
7615 if (!uvm_page_mask_region_empty(pages_changing, region)) {
7616 new_pte_state->needs_4k = true;
7617 }
7618 else {
7619 region = uvm_va_block_region(big_region_all.outer, uvm_va_block_num_cpu_pages(block));
7620 if (!uvm_page_mask_region_empty(pages_changing, region))
7621 new_pte_state->needs_4k = true;
7622 }
7623 }
7624
7625 // Now add in the PTEs which should be big but weren't covered by this
7626 // operation.
7627 //
7628 // Note that we can't assume that a given page table range has been
7629 // initialized if it's present here, since it could have been allocated by a
7630 // thread which had to restart its operation due to allocation retry.
7631 if (gpu_state->pte_is_2m || (block_gpu_supports_2m(block, gpu) && !gpu_state->page_table_range_2m.table)) {
7632 // We're splitting a 2M PTE so all of the uncovered big PTE regions will
7633 // become big PTEs which inherit the 2M permissions. If we haven't
7634 // allocated the 2M table yet, it will start as a 2M PTE until the lower
7635 // levels are allocated, so it's the same split case regardless of
7636 // whether this operation will need to retry a later allocation.
7637 bitmap_complement(big_ptes_not_covered, new_pte_state->big_ptes_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7638 }
7639 else if (!gpu_state->page_table_range_4k.table && !new_pte_state->needs_4k) {
7640 // If we don't have 4k PTEs and we won't be allocating them for this
7641 // operation, all of our PTEs need to be big.
7642 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7643 bitmap_zero(big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7644 bitmap_set(big_ptes_not_covered, 0, uvm_va_block_num_big_pages(block, big_page_size));
7645 }
7646 else {
7647 // Otherwise, add in all of the currently-big PTEs which are unchanging.
7648 // They won't be written, but they need to be carried into the new
7649 // gpu_state->big_ptes when it's updated.
7650 bitmap_andnot(big_ptes_not_covered,
7651 gpu_state->big_ptes,
7652 new_pte_state->big_ptes_covered,
7653 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7654 }
7655
7656 bitmap_or(new_pte_state->big_ptes, new_pte_state->big_ptes, big_ptes_not_covered, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
7657 }
7658
7659 // Wrapper around uvm_page_tree_get_ptes() and uvm_page_tree_alloc_table() that
7660 // handles allocation retry. If the block lock has been unlocked and relocked as
7661 // part of the allocation, NV_ERR_MORE_PROCESSING_REQUIRED is returned to signal
7662 // to the caller that the operation likely needs to be restarted. If that
7663 // happens, the pending tracker is added to the block's tracker.
7664 static NV_STATUS block_alloc_pt_range_with_retry(uvm_va_block_t *va_block,
7665 uvm_gpu_t *gpu,
7666 NvU32 page_size,
7667 uvm_page_table_range_t *page_table_range,
7668 uvm_tracker_t *pending_tracker)
7669 {
7670 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7671 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
7672 uvm_page_tree_t *page_tables = &gpu_va_space->page_tables;
7673 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
7674 uvm_page_table_range_t local_range;
7675 NV_STATUS status;
7676
7677 // Blocks may contain large PTEs without starting on a PTE boundary or
7678 // having an aligned size. Cover the PTEs of this size in the block's
7679 // interior so we match uvm_va_block_gpu_state_t::big_ptes.
7680 NvU64 start = UVM_ALIGN_UP(va_block->start, page_size);
7681 NvU64 size = UVM_ALIGN_DOWN(va_block->end + 1, page_size) - start;
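    // For example (hypothetical addresses), a block covering
    // [0x10013000, 0x101fd000) with a 64K page_size gets page tables for the
    // aligned interior [0x10020000, 0x101f0000), i.e. start = 0x10020000 and
    // size = 0x1d0000.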
7682
7683 // VA blocks which can use the 2MB level as either a PTE or a PDE need to
7684 // account for the PDE specially, so they must use uvm_page_tree_alloc_table
7685 // to allocate the lower levels.
7686 bool use_alloc_table = block_gpu_supports_2m(va_block, gpu) && page_size < UVM_PAGE_SIZE_2M;
7687
7688 UVM_ASSERT(page_table_range->table == NULL);
7689
7690 if (va_block_test && va_block_test->page_table_allocation_retry_force_count > 0) {
7691 --va_block_test->page_table_allocation_retry_force_count;
7692 status = NV_ERR_NO_MEMORY;
7693 }
7694 else if (use_alloc_table) {
7695 // Pascal+: 4k/64k tables under a 2M entry
7696 UVM_ASSERT(gpu_state->page_table_range_2m.table);
7697 status = uvm_page_tree_alloc_table(page_tables,
7698 page_size,
7699 UVM_PMM_ALLOC_FLAGS_NONE,
7700 &gpu_state->page_table_range_2m,
7701 page_table_range);
7702 }
7703 else {
7704 // 4k/big tables on pre-Pascal, and the 2M entry on Pascal+
7705 status = uvm_page_tree_get_ptes(page_tables,
7706 page_size,
7707 start,
7708 size,
7709 UVM_PMM_ALLOC_FLAGS_NONE,
7710 page_table_range);
7711 }
7712
7713 if (status == NV_OK)
7714 goto allocated;
7715
7716 if (status != NV_ERR_NO_MEMORY)
7717 return status;
7718
7719 // Before unlocking the block lock, any pending work on the block has to be
7720 // added to the block's tracker.
7721 if (pending_tracker) {
7722 status = uvm_tracker_add_tracker_safe(&va_block->tracker, pending_tracker);
7723 if (status != NV_OK)
7724 return status;
7725 }
7726
7727 // Unlock the va block and retry with eviction enabled
7728 uvm_mutex_unlock(&va_block->lock);
7729
7730 if (use_alloc_table) {
7731 // Although we don't hold the block lock here, it's safe to pass
7732 // gpu_state->page_table_range_2m to the page tree code because we know
7733 // that the 2m range has already been allocated, and that it can't go
7734 // away while we have the va_space lock held.
7735 status = uvm_page_tree_alloc_table(page_tables,
7736 page_size,
7737 UVM_PMM_ALLOC_FLAGS_EVICT,
7738 &gpu_state->page_table_range_2m,
7739 &local_range);
7740 }
7741 else {
7742 status = uvm_page_tree_get_ptes(page_tables,
7743 page_size,
7744 start,
7745 size,
7746 UVM_PMM_ALLOC_FLAGS_EVICT,
7747 &local_range);
7748 }
7749
7750 uvm_mutex_lock(&va_block->lock);
7751
7752 if (status != NV_OK)
7753 return status;
7754
7755 status = NV_ERR_MORE_PROCESSING_REQUIRED;
7756
7757 if (page_table_range->table) {
7758         // A different caller allocated the page tables in the meantime, so
7759         // release the local copy.
7760 uvm_page_tree_put_ptes(page_tables, &local_range);
7761 return status;
7762 }
7763
7764 *page_table_range = local_range;
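    // Note that the range is left populated even on this
    // NV_ERR_MORE_PROCESSING_REQUIRED path, so when the caller restarts the
    // operation, block_alloc_ptes_with_retry will see range->table != NULL
    // and skip this allocation.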
7765
7766 allocated:
7767 // Mark the 2M PTE as active when we first allocate it, since we don't have
7768 // any PTEs below it yet.
7769 if (page_size == UVM_PAGE_SIZE_2M) {
7770 UVM_ASSERT(!gpu_state->pte_is_2m);
7771 gpu_state->pte_is_2m = true;
7772 }
7773 else if (page_size != UVM_PAGE_SIZE_4K) {
7774 // uvm_page_tree_get_ptes initializes big PTEs to invalid.
7775 // uvm_page_tree_alloc_table does not, so we'll have to do it later.
7776 if (use_alloc_table)
7777 UVM_ASSERT(!gpu_state->initialized_big);
7778 else
7779 gpu_state->initialized_big = true;
7780 }
7781
7782 return status;
7783 }
7784
7785 // Helper which allocates all page table ranges necessary for the given page
7786 // sizes. See block_alloc_pt_range_with_retry.
7787 static NV_STATUS block_alloc_ptes_with_retry(uvm_va_block_t *va_block,
7788 uvm_gpu_t *gpu,
7789 NvU32 page_sizes,
7790 uvm_tracker_t *pending_tracker)
7791 {
7792 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
7793 uvm_gpu_va_space_t *gpu_va_space = uvm_va_block_get_gpu_va_space(va_block, gpu);
7794 uvm_page_table_range_t *range;
7795 NvU32 page_size;
7796 NV_STATUS status, final_status = NV_OK;
7797
7798 UVM_ASSERT(gpu_state);
7799
7800 // Blocks which can map 2M PTE/PDEs must always allocate the 2MB level first
7801 // in order to allocate the levels below.
7802 if (block_gpu_supports_2m(va_block, gpu))
7803 page_sizes |= UVM_PAGE_SIZE_2M;
7804
7805 UVM_ASSERT((page_sizes & gpu_va_space->page_tables.hal->page_sizes()) == page_sizes);
7806
7807 for_each_chunk_size_rev(page_size, page_sizes) {
7808 if (page_size == UVM_PAGE_SIZE_2M)
7809 range = &gpu_state->page_table_range_2m;
7810 else if (page_size == UVM_PAGE_SIZE_4K)
7811 range = &gpu_state->page_table_range_4k;
7812 else
7813 range = &gpu_state->page_table_range_big;
7814
7815 if (range->table)
7816 continue;
7817
7818 if (page_size == UVM_PAGE_SIZE_2M) {
7819 UVM_ASSERT(!gpu_state->pte_is_2m);
7820 UVM_ASSERT(!gpu_state->page_table_range_big.table);
7821 UVM_ASSERT(!gpu_state->page_table_range_4k.table);
7822 }
7823 else if (page_size != UVM_PAGE_SIZE_4K) {
7824 UVM_ASSERT(uvm_va_block_num_big_pages(va_block, uvm_va_block_gpu_big_page_size(va_block, gpu)) > 0);
7825 UVM_ASSERT(bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7826 }
7827
7828 status = block_alloc_pt_range_with_retry(va_block, gpu, page_size, range, pending_tracker);
7829
7830 // Keep going to allocate the remaining levels even if the allocation
7831 // requires a retry, since we'll likely still need them when we retry
7832 // anyway.
7833 if (status == NV_ERR_MORE_PROCESSING_REQUIRED)
7834 final_status = NV_ERR_MORE_PROCESSING_REQUIRED;
7835 else if (status != NV_OK)
7836 return status;
7837 }
7838
7839 return final_status;
7840 }
7841
7842 static NV_STATUS block_alloc_ptes_new_state(uvm_va_block_t *va_block,
7843 uvm_gpu_t *gpu,
7844 uvm_va_block_new_pte_state_t *new_pte_state,
7845 uvm_tracker_t *pending_tracker)
7846 {
7847 NvU32 page_sizes = 0;
7848
7849 if (new_pte_state->pte_is_2m) {
7850 page_sizes |= UVM_PAGE_SIZE_2M;
7851 }
7852 else {
7853 if (!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
7854 page_sizes |= uvm_va_block_gpu_big_page_size(va_block, gpu);
7855
7856 if (new_pte_state->needs_4k)
7857 page_sizes |= UVM_PAGE_SIZE_4K;
7858 else
7859 UVM_ASSERT(!bitmap_empty(new_pte_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK));
7860 }
7861
7862 return block_alloc_ptes_with_retry(va_block, gpu, page_sizes, pending_tracker);
7863 }
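
// Illustrative sketch (not called by the driver): the typical flow a PTE
// unmap operation follows once it has a push. It computes the new PTE state,
// allocates any missing page table ranges, and applies the change with the
// unmap helper matching the current and target states. A real caller (see
// block_unmap_gpu below) also creates the push, derives tlb_membar with
// uvm_hal_downgrade_membar_type, updates gpu_state->pte_bits, and handles the
// NV_ERR_MORE_PROCESSING_REQUIRED restart.
static NV_STATUS __maybe_unused block_gpu_unmap_flow_sketch(uvm_va_block_t *block,
                                                            uvm_va_block_context_t *block_context,
                                                            uvm_gpu_t *gpu,
                                                            const uvm_page_mask_t *pages_to_unmap,
                                                            const uvm_page_mask_t *page_mask_after,
                                                            uvm_membar_t tlb_membar,
                                                            uvm_push_t *push,
                                                            uvm_tracker_t *pending_tracker)
{
    uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
    uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
    NV_STATUS status;

    // Decide which mix of 2M, big and 4k PTEs the block should end up with.
    block_gpu_compute_new_pte_state(block, gpu, UVM_ID_INVALID, pages_to_unmap, page_mask_after, new_pte_state);

    // Allocate any page table ranges the new state needs. This may return
    // NV_ERR_MORE_PROCESSING_REQUIRED if the block lock was dropped, in which
    // case the whole operation is expected to be restarted.
    status = block_alloc_ptes_new_state(block, gpu, new_pte_state, pending_tracker);
    if (status != NV_OK)
        return status;

    if (new_pte_state->pte_is_2m)
        block_gpu_unmap_to_2m(block, block_context, gpu, push, tlb_membar);
    else if (gpu_state->pte_is_2m)
        block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, push, tlb_membar);
    else
        block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, push, tlb_membar);

    return NV_OK;
}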
7864
7865 // Make sure that GMMU PDEs down to PDE1 are populated for the given VA block.
7866 // This is currently used on ATS systems to prevent GPUs from inadvertently
7867 // accessing sysmem via ATS because there is no PDE1 in the GMMU page tables,
7868 // which is where the NOATS bit resides.
7869 //
7870 // The current implementation simply pre-allocates the PTEs for the VA Block,
7871 // which is wasteful because the GPU may never need them.
7872 //
7873 // TODO: Bug 2064188: Change the MMU code to be able to directly refcount PDE1
7874 // page table entries without having to request PTEs.
7875 static NV_STATUS block_pre_populate_pde1_gpu(uvm_va_block_t *block,
7876 uvm_gpu_va_space_t *gpu_va_space,
7877 uvm_tracker_t *pending_tracker)
7878 {
7879 NvU32 page_sizes;
7880 NvU32 big_page_size;
7881 uvm_gpu_t *gpu;
7882 uvm_va_block_gpu_state_t *gpu_state;
7883
7884 UVM_ASSERT(block);
7885 UVM_ASSERT(gpu_va_space);
7886 UVM_ASSERT(gpu_va_space->ats.enabled);
7887 UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
7888
7889 gpu = gpu_va_space->gpu;
7890 big_page_size = gpu_va_space->page_tables.big_page_size;
7891
7892 gpu_state = block_gpu_state_get_alloc(block, gpu);
7893 if (!gpu_state)
7894 return NV_ERR_NO_MEMORY;
7895
7896 // If the VA Block supports 2M pages, allocate the 2M PTE only, as it
7897 // requires less memory
7898 if (block_gpu_supports_2m(block, gpu))
7899 page_sizes = UVM_PAGE_SIZE_2M;
7900 else if (uvm_va_block_num_big_pages(block, big_page_size) > 0)
7901 page_sizes = big_page_size;
7902 else
7903 page_sizes = UVM_PAGE_SIZE_4K;
7904
7905 return block_alloc_ptes_with_retry(block, gpu, page_sizes, pending_tracker);
7906 }
7907
7908 static NV_STATUS block_pre_populate_pde1_all_gpus(uvm_va_block_t *block, uvm_tracker_t *pending_tracker)
7909 {
7910 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
7911 NV_STATUS status = NV_OK;
7912
7913 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
7914 // comments in block_pre_populate_pde1_gpu.
7915 if (g_uvm_global.ats.enabled && !block->cpu.ever_mapped) {
7916 uvm_gpu_va_space_t *gpu_va_space;
7917
7918 for_each_gpu_va_space(gpu_va_space, va_space) {
7919 // We only care about systems where ATS is supported and the application
7920 // enabled it.
7921 if (!gpu_va_space->ats.enabled)
7922 continue;
7923
7924 status = block_pre_populate_pde1_gpu(block, gpu_va_space, pending_tracker);
7925 if (status != NV_OK)
7926 break;
7927 }
7928 }
7929
7930 return status;
7931 }
7932
7933 static NV_STATUS block_unmap_gpu(uvm_va_block_t *block,
7934 uvm_va_block_context_t *block_context,
7935 uvm_gpu_t *gpu,
7936 const uvm_page_mask_t *unmap_page_mask,
7937 uvm_tracker_t *out_tracker)
7938 {
7939 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
7940 uvm_pte_bits_gpu_t pte_bit;
7941 uvm_push_t push;
7942 uvm_membar_t tlb_membar;
7943 bool only_local_mappings;
7944 uvm_page_mask_t *pages_to_unmap = &block_context->mapping.page_mask;
7945 NV_STATUS status;
7946 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
7947 bool mask_empty;
7948 uvm_processor_mask_t *non_uvm_lite_gpus = &block_context->mapping.non_uvm_lite_gpus;
7949
7950 // We have to check gpu_state before looking at any VA space state like our
7951 // gpu_va_space, because we could be on the eviction path where we don't
7952 // have a lock on that state. However, since remove_gpu_va_space walks each
7953 // block to unmap the GPU before destroying the gpu_va_space, we're
7954 // guaranteed that if this GPU has page tables, the gpu_va_space can't go
7955 // away while we're holding the block lock.
7956 if (!block_gpu_has_page_tables(block, gpu))
7957 return NV_OK;
7958
7959 if (!uvm_page_mask_and(pages_to_unmap, unmap_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]))
7960 return NV_OK;
7961
7962 // block_gpu_compute_new_pte_state needs a mask of pages which will have
7963 // matching attributes after the operation is performed. In the case of
7964 // unmap, those are the pages with unset bits.
7965 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], pages_to_unmap);
7966 uvm_page_mask_complement(&block_context->scratch_page_mask, &block_context->scratch_page_mask);
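    // Worked example: if pages 0..255 are currently readable and pages_to_unmap
    // is 0..127, then (readable &~ pages_to_unmap) is 128..255 and its
    // complement covers every other page, i.e. all pages which will be unmapped
    // (or were never mapped) once this operation completes. Those pages share
    // the same attributes, which is what block_gpu_compute_new_pte_state
    // expects in its "pages with matching attributes" mask.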
7967 block_gpu_compute_new_pte_state(block,
7968 gpu,
7969 UVM_ID_INVALID,
7970 pages_to_unmap,
7971 &block_context->scratch_page_mask,
7972 new_pte_state);
7973
7974 status = block_alloc_ptes_new_state(block, gpu, new_pte_state, out_tracker);
7975 if (status != NV_OK)
7976 return status;
7977
7978 only_local_mappings = !block_has_remote_mapping_gpu(block, &block_context->scratch_page_mask, gpu->id, pages_to_unmap);
7979 tlb_membar = uvm_hal_downgrade_membar_type(gpu, only_local_mappings);
7980
7981 status = uvm_push_begin_acquire(gpu->channel_manager,
7982 UVM_CHANNEL_TYPE_MEMOPS,
7983 &block->tracker,
7984 &push,
7985 "Unmapping pages in block [0x%llx, 0x%llx)",
7986 block->start,
7987 block->end + 1);
7988 if (status != NV_OK)
7989 return status;
7990
7991 if (new_pte_state->pte_is_2m) {
7992 // We're either unmapping a whole valid 2M PTE, or we're unmapping all
7993 // remaining pages in a split 2M PTE.
7994 block_gpu_unmap_to_2m(block, block_context, gpu, &push, tlb_membar);
7995 }
7996 else if (gpu_state->pte_is_2m) {
7997 // The block is currently mapped as a valid 2M PTE and we're unmapping
7998 // some pages within the 2M, so we have to split it into the appropriate
7999 // mix of big and 4k PTEs.
8000 block_gpu_unmap_split_2m(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
8001 }
8002 else {
8003 // We're unmapping some pre-existing mix of big and 4K PTEs into some
8004 // other mix of big and 4K PTEs.
8005 block_gpu_unmap_big_and_4k(block, block_context, gpu, pages_to_unmap, &push, tlb_membar);
8006 }
8007
8008 uvm_push_end(&push);
8009
8010 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(block), gpu->id)) {
8011
8012 uvm_processor_mask_andnot(non_uvm_lite_gpus, &block->mapped, block_get_uvm_lite_gpus(block));
8013
8014 UVM_ASSERT(uvm_processor_mask_test(non_uvm_lite_gpus, gpu->id));
8015
8016 // If the GPU is the only non-UVM-Lite processor with mappings, we can
8017 // safely mark pages as fully unmapped
8018 if (uvm_processor_mask_get_count(non_uvm_lite_gpus) == 1 && !uvm_va_block_is_hmm(block))
8019 uvm_page_mask_andnot(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_unmap);
8020 }
8021
8022 // Clear block PTE state
8023 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
8024 mask_empty = !uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit],
8025 &gpu_state->pte_bits[pte_bit],
8026 pages_to_unmap);
8027 if (pte_bit == UVM_PTE_BITS_GPU_READ && mask_empty)
8028 uvm_processor_mask_clear(&block->mapped, gpu->id);
8029 }
8030
8031 UVM_ASSERT(block_check_mappings(block, block_context));
8032
8033 return uvm_tracker_add_push_safe(out_tracker, &push);
8034 }
8035
8036 NV_STATUS uvm_va_block_unmap(uvm_va_block_t *va_block,
8037 uvm_va_block_context_t *va_block_context,
8038 uvm_processor_id_t id,
8039 uvm_va_block_region_t region,
8040 const uvm_page_mask_t *unmap_page_mask,
8041 uvm_tracker_t *out_tracker)
8042 {
8043 uvm_page_mask_t *region_page_mask = &va_block_context->mapping.map_running_page_mask;
8044
8045 UVM_ASSERT(!uvm_va_block_is_dead(va_block));
8046 uvm_assert_mutex_locked(&va_block->lock);
8047
8048 if (UVM_ID_IS_CPU(id)) {
8049 block_unmap_cpu(va_block, va_block_context, region, unmap_page_mask);
8050 return NV_OK;
8051 }
8052
8053 uvm_page_mask_init_from_region(region_page_mask, region, unmap_page_mask);
8054
8055 return block_unmap_gpu(va_block, va_block_context, block_get_gpu(va_block, id), region_page_mask, out_tracker);
8056 }
8057
8058 // This function essentially works as a wrapper around vm_insert_page (hence
8059 // the similar function prototype). This is needed since vm_insert_page
8060 // doesn't take permissions as input, but uses vma->vm_page_prot instead.
8061 // Since we may have multiple VA blocks under one VMA which need to map
8062 // with different permissions, we have to manually change vma->vm_page_prot for
8063 // each call to vm_insert_page. Multiple faults under one VMA in separate
8064 // blocks can be serviced concurrently, so the VMA wrapper lock is used
8065 // to protect access to vma->vm_page_prot.
8066 static NV_STATUS uvm_cpu_insert_page(struct vm_area_struct *vma,
8067 NvU64 addr,
8068 struct page *page,
8069 uvm_prot_t new_prot)
8070 {
8071 uvm_vma_wrapper_t *vma_wrapper;
8072 unsigned long target_flags;
8073 pgprot_t target_pgprot;
8074 int ret;
8075
8076 UVM_ASSERT(vma);
8077 UVM_ASSERT(vma->vm_private_data);
8078
8079 vma_wrapper = vma->vm_private_data;
8080 target_flags = vma->vm_flags;
8081
8082 if (new_prot == UVM_PROT_READ_ONLY)
8083 target_flags &= ~VM_WRITE;
8084
8085 target_pgprot = vm_get_page_prot(target_flags);
8086
8087 // Take VMA wrapper lock to check vma->vm_page_prot
8088 uvm_down_read(&vma_wrapper->lock);
8089
8090     // Take a write lock if we need to modify vma->vm_page_prot, i.e. when:
8091     // - vma->vm_page_prot creates writable PTEs but new_prot is RO
8092     // - vma->vm_page_prot creates read-only PTEs but new_prot is RW
8093 if (pgprot_val(vma->vm_page_prot) != pgprot_val(target_pgprot)) {
8094 uvm_up_read(&vma_wrapper->lock);
8095 uvm_down_write(&vma_wrapper->lock);
8096
8097 vma->vm_page_prot = target_pgprot;
8098
8099 uvm_downgrade_write(&vma_wrapper->lock);
8100 }
8101
8102 ret = vm_insert_page(vma, addr, page);
8103 uvm_up_read(&vma_wrapper->lock);
8104 if (ret) {
8105 UVM_ASSERT_MSG(ret == -ENOMEM, "ret: %d\n", ret);
8106 return errno_to_nv_status(ret);
8107 }
8108
8109 return NV_OK;
8110 }
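// Illustrative behavior (not an additional code path): on a VMA created with
// PROT_READ | PROT_WRITE, vma->vm_flags contains VM_READ | VM_WRITE. Requesting
// UVM_PROT_READ_ONLY clears VM_WRITE from the local copy before calling
// vm_get_page_prot(), so the PTE installed by vm_insert_page() is read-only
// even though the VMA itself remains writable; a later write fault can then
// upgrade the mapping.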
8111
8112 static uvm_prot_t compute_logical_prot(uvm_va_block_t *va_block,
8113 struct vm_area_struct *hmm_vma,
8114 uvm_page_index_t page_index)
8115 {
8116 uvm_prot_t logical_prot;
8117
8118 if (uvm_va_block_is_hmm(va_block)) {
8119 NvU64 addr = uvm_va_block_cpu_page_address(va_block, page_index);
8120
8121 logical_prot = uvm_hmm_compute_logical_prot(va_block, hmm_vma, addr);
8122 }
8123 else {
8124 uvm_va_range_t *va_range = va_block->va_range;
8125
8126 UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
8127
8128 // Zombified VA ranges no longer have a vma, so they have no permissions
8129 if (uvm_va_range_is_managed_zombie(va_range)) {
8130 logical_prot = UVM_PROT_NONE;
8131 }
8132 else {
8133 struct vm_area_struct *vma;
8134
8135 vma = uvm_va_range_vma(va_range);
8136
8137 if (!(vma->vm_flags & VM_READ))
8138 logical_prot = UVM_PROT_NONE;
8139 else if (!(vma->vm_flags & VM_WRITE))
8140 logical_prot = UVM_PROT_READ_ONLY;
8141 else
8142 logical_prot = UVM_PROT_READ_WRITE_ATOMIC;
8143 }
8144 }
8145
8146 return logical_prot;
8147 }
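// Summary of the managed (non-HMM) branch above:
//
//     vma->vm_flags              logical protection
//     ------------------------   --------------------------
//     VM_READ not set            UVM_PROT_NONE
//     VM_READ only               UVM_PROT_READ_ONLY
//     VM_READ | VM_WRITE         UVM_PROT_READ_WRITE_ATOMIC
//
// Zombified managed ranges always report UVM_PROT_NONE.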
8148
8149 static struct page *block_page_get(uvm_va_block_t *block, block_phys_page_t block_page)
8150 {
8151 struct page *page;
8152
8153 if (UVM_ID_IS_CPU(block_page.processor)) {
8154 page = uvm_va_block_get_cpu_page(block, block_page.page_index);
8155 }
8156 else {
8157 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8158 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, block_page.processor);
8159 size_t chunk_offset;
8160 uvm_gpu_chunk_t *chunk = block_phys_page_chunk(block, block_page, &chunk_offset);
8161
8162 UVM_ASSERT(gpu->mem_info.numa.enabled);
8163 page = uvm_gpu_chunk_to_page(&gpu->pmm, chunk) + chunk_offset / PAGE_SIZE;
8164 }
8165
8166 UVM_ASSERT(page);
8167 return page;
8168 }
8169
8170 // Creates or upgrades a CPU mapping for the given page, updating the block's
8171 // mapping and pte_bits bitmaps as appropriate. Upon successful return, the page
8172 // will be mapped with at least new_prot permissions.
8173 //
8174 // This never downgrades mappings, so new_prot must not be UVM_PROT_NONE. Use
8175 // block_unmap_cpu or uvm_va_block_revoke_prot instead.
8176 //
8177 // If the existing mapping is >= new_prot already, this is a no-op.
8178 //
8179 // It is the caller's responsibility to:
8180 // - Revoke mappings from other processors as appropriate so the CPU can map
8181 // with new_prot permissions
8182 // - Guarantee that vm_insert_page is safe to use (vma->vm_mm has a reference
8183 // and mmap_lock is held in at least read mode)
8184 // - For HMM blocks, ensure the vma is valid and safe to use, vma->vm_mm has a
8185 //   reference, and mmap_lock is held in at least read mode
8186 // - Ensure that the struct page corresponding to the physical memory being
8187 // mapped exists
8188 // - Manage the block's residency bitmap
8189 // - Ensure that the block hasn't been killed (block->va_range is present)
8190 // - Update the pte/mapping tracking state on success
8191 static NV_STATUS block_map_cpu_page_to(uvm_va_block_t *block,
8192 struct vm_area_struct *hmm_vma,
8193 uvm_processor_id_t resident_id,
8194 uvm_page_index_t page_index,
8195 uvm_prot_t new_prot)
8196 {
8197 uvm_prot_t curr_prot = block_page_prot_cpu(block, page_index);
8198 uvm_va_range_t *va_range = block->va_range;
8199 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8200 struct vm_area_struct *vma;
8201 NV_STATUS status;
8202 NvU64 addr;
8203 struct page *page;
8204 int nid = NUMA_NO_NODE;
8205
8206 UVM_ASSERT((uvm_va_block_is_hmm(block) && hmm_vma) || va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
8207 UVM_ASSERT(new_prot != UVM_PROT_NONE);
8208 UVM_ASSERT(new_prot < UVM_PROT_MAX);
8209 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
8210
8211 uvm_assert_mutex_locked(&block->lock);
8212 if (UVM_ID_IS_CPU(resident_id))
8213 UVM_ASSERT(uvm_page_mask_test(&block->cpu.allocated, page_index));
8214
8215 // For the CPU, write implies atomic
8216 if (new_prot == UVM_PROT_READ_WRITE)
8217 new_prot = UVM_PROT_READ_WRITE_ATOMIC;
8218
8219 // Only upgrades are supported in this function
8220 UVM_ASSERT(curr_prot <= new_prot);
8221
8222 if (new_prot == curr_prot)
8223 return NV_OK;
8224
8225 // Check for existing VMA permissions. They could have been modified after
8226 // the initial mmap by mprotect.
8227 if (new_prot > compute_logical_prot(block, hmm_vma, page_index))
8228 return NV_ERR_INVALID_ACCESS_TYPE;
8229
8230 if (uvm_va_block_is_hmm(block)) {
8231 // Do not map CPU pages because they belong to the Linux kernel.
8232 return NV_OK;
8233 }
8234
8235 UVM_ASSERT(va_range);
8236
8237 if (UVM_ID_IS_CPU(resident_id)) {
8238 if (UVM_ID_IS_CPU(uvm_va_range_get_policy(va_range)->preferred_location)) {
8239 // Add the page's range group range to the range group's migrated list.
8240 uvm_range_group_range_t *rgr = uvm_range_group_range_find(va_space,
8241 uvm_va_block_cpu_page_address(block, page_index));
8242 if (rgr != NULL) {
8243 uvm_spin_lock(&rgr->range_group->migrated_ranges_lock);
8244 if (list_empty(&rgr->range_group_migrated_list_node))
8245 list_move_tail(&rgr->range_group_migrated_list_node, &rgr->range_group->migrated_ranges);
8246 uvm_spin_unlock(&rgr->range_group->migrated_ranges_lock);
8247 }
8248 }
8249
8250 nid = block_get_page_node_residency(block, page_index);
8251 UVM_ASSERT(nid != NUMA_NO_NODE);
8252 }
8253
8254 // It's possible here that current->mm != vma->vm_mm. That can happen for
8255 // example due to access_process_vm (ptrace) or get_user_pages from another
8256 // driver.
8257 //
8258 // In such cases the caller has taken care of ref counting vma->vm_mm for
8259 // us, so we can safely operate on the vma but we can't use
8260 // uvm_va_range_vma_current.
8261 vma = uvm_va_range_vma(va_range);
8262 uvm_assert_mmap_lock_locked(vma->vm_mm);
8263 UVM_ASSERT(!uvm_va_space_mm_enabled(va_space) || va_space->va_space_mm.mm == vma->vm_mm);
8264
8265 // Add the mapping
8266 addr = uvm_va_block_cpu_page_address(block, page_index);
8267
8268 // This unmap handles upgrades as vm_insert_page returns -EBUSY when
8269     // there's already a mapping present at addr, so we have to unmap
8270 // first anyway when upgrading from RO -> RW.
8271 if (curr_prot != UVM_PROT_NONE)
8272 unmap_mapping_range(va_space->mapping, addr, PAGE_SIZE, 1);
8273
8274 // Don't map the CPU until prior copies and GPU PTE updates finish,
8275 // otherwise we might not stay coherent.
8276 status = uvm_tracker_wait(&block->tracker);
8277 if (status != NV_OK)
8278 return status;
8279
8280 page = block_page_get(block, block_phys_page(resident_id, nid, page_index));
8281 return uvm_cpu_insert_page(vma, addr, page, new_prot);
8282 }
8283
8284 // Maps the CPU to the given pages which are resident on resident_id.
8285 // map_page_mask is an in/out parameter: the pages which are mapped to
8286 // resident_id are removed from the mask before returning.
8287 //
8288 // Caller must ensure that:
8289 // - Pages in map_page_mask must not be set in the corresponding cpu.pte_bits
8290 // mask for the requested protection.
8291 static NV_STATUS block_map_cpu_to(uvm_va_block_t *block,
8292 uvm_va_block_context_t *block_context,
8293 uvm_processor_id_t resident_id,
8294 uvm_va_block_region_t region,
8295 uvm_page_mask_t *map_page_mask,
8296 uvm_prot_t new_prot,
8297 uvm_tracker_t *out_tracker)
8298 {
8299 NV_STATUS status = NV_OK;
8300 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8301 uvm_page_index_t page_index;
8302 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
8303 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(block, resident_id, NUMA_NO_NODE);
8304 uvm_pte_bits_cpu_t prot_pte_bit = get_cpu_pte_bit_index(new_prot);
8305 uvm_pte_bits_cpu_t pte_bit;
8306
8307 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], UVM_ID_CPU));
8308
8309 // TODO: Bug 1766424: Check if optimizing the unmap_mapping_range calls
8310 // within block_map_cpu_page_to by doing them once here is helpful.
8311
8312 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
8313 map_page_mask,
8314 &block->cpu.pte_bits[prot_pte_bit]));
8315
8316 // The pages which will actually change are those in the input page mask
8317 // which are resident on the target.
8318 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
8319 return NV_OK;
8320
8321 status = block_pre_populate_pde1_all_gpus(block, out_tracker);
8322 if (status != NV_OK)
8323 return status;
8324
8325 block->cpu.ever_mapped = true;
8326
8327 for_each_va_block_page_in_region_mask(page_index, pages_to_map, region) {
8328 status = block_map_cpu_page_to(block,
8329 block_context->hmm.vma,
8330 resident_id,
8331 page_index,
8332 new_prot);
8333 if (status != NV_OK)
8334 break;
8335
8336 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
8337 }
8338
8339 // If there was some error, shrink the region so that we only update the
8340 // pte/mapping tracking bits for the pages that succeeded
8341 if (status != NV_OK) {
8342 region = uvm_va_block_region(region.first, page_index);
8343 uvm_page_mask_region_clear_outside(pages_to_map, region);
8344 }
8345
8346     // If pages are mapped from a remote residency, report the remote mapping
8347     // events to tools. We skip event notification if the cause is Invalid,
8348     // which is used to signal that this function is being called from the
8349     // revocation path, in order to avoid reporting duplicate events.
8350 if (UVM_ID_IS_GPU(resident_id) &&
8351 va_space->tools.enabled &&
8352 block_context->mapping.cause != UvmEventMapRemoteCauseInvalid) {
8353 uvm_va_block_region_t subregion;
8354 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
8355 uvm_tools_record_map_remote(block,
8356 NULL,
8357 UVM_ID_CPU,
8358 resident_id,
8359 uvm_va_block_region_start(block, subregion),
8360 uvm_va_block_region_size(subregion),
8361 block_context->mapping.cause);
8362 }
8363 }
8364
8365 // Update CPU mapping state
8366 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
8367 uvm_page_mask_or(&block->cpu.pte_bits[pte_bit], &block->cpu.pte_bits[pte_bit], pages_to_map);
8368
8369 if (!uvm_va_block_is_hmm(block))
8370 uvm_page_mask_or(&block->maybe_mapped_pages, &block->maybe_mapped_pages, pages_to_map);
8371
8372 UVM_ASSERT(block_check_mappings(block, block_context));
8373
8374 // Remove all pages that were newly-mapped from the input mask
8375 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
8376
8377 return status;
8378 }
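// Illustrative example of the in/out contract above: if the caller asks to map
// pages 0..63 and only pages 0..31 are resident on resident_id, pages_to_map
// becomes 0..31, those pages are mapped, and on return map_page_mask holds
// 32..63 so the caller can try the next residency (see the loop over
// for_each_closest_id() in uvm_va_block_map()).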
8379
8380 // Maps the GPU to the given pages which are resident on resident_id.
8381 // map_page_mask is an in/out parameter: the pages which are mapped
8382 // to resident_id are removed from the mask before returning.
8383 //
8384 // Caller must ensure that:
8385 // - Pages in map_page_mask must not be set in the corresponding pte_bits mask
8386 // for the requested protection on the mapping GPU.
8387 static NV_STATUS block_map_gpu_to(uvm_va_block_t *va_block,
8388 uvm_va_block_context_t *block_context,
8389 uvm_gpu_t *gpu,
8390 uvm_processor_id_t resident_id,
8391 int resident_nid,
8392 uvm_page_mask_t *map_page_mask,
8393 uvm_prot_t new_prot,
8394 uvm_tracker_t *out_tracker)
8395 {
8396 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8397 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8398 uvm_push_t push;
8399 NV_STATUS status;
8400 uvm_page_mask_t *pages_to_map = &block_context->mapping.page_mask;
8401 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, resident_nid);
8402 uvm_pte_bits_gpu_t pte_bit;
8403 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(new_prot);
8404 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
8405 block_pte_op_t pte_op;
8406
8407 UVM_ASSERT(map_page_mask);
8408 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(resident_id)], gpu->id));
8409
8410 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id)) {
8411 uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
8412 UVM_ASSERT(uvm_va_policy_preferred_location_equal(policy, resident_id, policy->preferred_nid));
8413 }
8414
8415 UVM_ASSERT(!uvm_page_mask_and(&block_context->scratch_page_mask,
8416 map_page_mask,
8417 &gpu_state->pte_bits[prot_pte_bit]));
8418
8419 // The pages which will actually change are those in the input page mask
8420 // which are resident on the target.
8421 if (!uvm_page_mask_and(pages_to_map, map_page_mask, resident_mask))
8422 return NV_OK;
8423
8424 UVM_ASSERT(block_check_mapping_residency(va_block, block_context, gpu, resident_id, pages_to_map));
8425
8426 // For PTE merge/split computation, compute all resident pages which will
8427 // have exactly new_prot after performing the mapping.
8428 uvm_page_mask_or(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_map);
8429 if (prot_pte_bit < UVM_PTE_BITS_GPU_ATOMIC) {
8430 uvm_page_mask_andnot(&block_context->scratch_page_mask,
8431 &block_context->scratch_page_mask,
8432 &gpu_state->pte_bits[prot_pte_bit + 1]);
8433 }
8434 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
8435
8436 block_gpu_compute_new_pte_state(va_block,
8437 gpu,
8438 resident_id,
8439 pages_to_map,
8440 &block_context->scratch_page_mask,
8441 new_pte_state);
8442
8443 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
8444 if (status != NV_OK)
8445 return status;
8446
8447 status = uvm_push_begin_acquire(gpu->channel_manager,
8448 UVM_CHANNEL_TYPE_MEMOPS,
8449 &va_block->tracker,
8450 &push,
8451 "Mapping pages in block [0x%llx, 0x%llx) as %s",
8452 va_block->start,
8453 va_block->end + 1,
8454 uvm_prot_string(new_prot));
8455 if (status != NV_OK)
8456 return status;
8457
8458 pte_op = BLOCK_PTE_OP_MAP;
8459 if (new_pte_state->pte_is_2m) {
8460 // We're either modifying permissions of a pre-existing 2M PTE, or all
8461 // permissions match so we can merge to a new 2M PTE.
8462 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
8463 }
8464 else if (gpu_state->pte_is_2m) {
8465 // Permissions on a subset of the existing 2M PTE are being upgraded, so
8466 // we have to split it into the appropriate mix of big and 4k PTEs.
8467 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
8468 }
8469 else {
8470 // We're upgrading permissions on some pre-existing mix of big and 4K
8471 // PTEs into some other mix of big and 4K PTEs.
8472 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_map, new_prot, &push, pte_op);
8473 }
8474
8475 // If we are mapping remotely, record the event
8476 if (va_space->tools.enabled && !uvm_id_equal(resident_id, gpu->id)) {
8477 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(va_block);
8478
8479 UVM_ASSERT(block_context->mapping.cause != UvmEventMapRemoteCauseInvalid);
8480
8481 for_each_va_block_subregion_in_mask(subregion, pages_to_map, region) {
8482 uvm_tools_record_map_remote(va_block,
8483 &push,
8484 gpu->id,
8485 resident_id,
8486 uvm_va_block_region_start(va_block, subregion),
8487 uvm_va_block_region_size(subregion),
8488 block_context->mapping.cause);
8489 }
8490 }
8491
8492 uvm_push_end(&push);
8493
8494 // Update GPU mapping state
8495 for (pte_bit = 0; pte_bit <= prot_pte_bit; pte_bit++)
8496 uvm_page_mask_or(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_map);
8497
8498 uvm_processor_mask_set(&va_block->mapped, gpu->id);
8499
8500 // If we are mapping a UVM-Lite GPU or HMM va_block, do not update
8501 // maybe_mapped_pages.
8502 if (!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id) &&
8503 !uvm_va_block_is_hmm(va_block))
8504 uvm_page_mask_or(&va_block->maybe_mapped_pages, &va_block->maybe_mapped_pages, pages_to_map);
8505
8506 // Remove all pages resident on this processor from the input mask, which
8507 // were newly-mapped.
8508 uvm_page_mask_andnot(map_page_mask, map_page_mask, pages_to_map);
8509
8510 UVM_ASSERT(block_check_mappings(va_block, block_context));
8511
8512 return uvm_tracker_add_push_safe(out_tracker, &push);
8513 }
8514
8515 // allowed_nid_mask is only valid if the CPU is set in allowed_mask.
8516 static void map_get_allowed_destinations(uvm_va_block_t *block,
8517 uvm_va_block_context_t *va_block_context,
8518 const uvm_va_policy_t *policy,
8519 uvm_processor_id_t id,
8520 uvm_processor_mask_t *allowed_mask,
8521 nodemask_t *allowed_nid_mask)
8522 {
8523 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8524
8525 *allowed_nid_mask = node_possible_map;
8526
8527 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(block), id)) {
8528 // UVM-Lite can only map resident pages on the preferred location
8529 uvm_processor_mask_zero(allowed_mask);
8530 uvm_processor_mask_set(allowed_mask, policy->preferred_location);
8531 if (UVM_ID_IS_CPU(policy->preferred_location) &&
8532 !uvm_va_policy_preferred_location_equal(policy, UVM_ID_CPU, NUMA_NO_NODE)) {
8533 nodes_clear(*allowed_nid_mask);
8534 node_set(policy->preferred_nid, *allowed_nid_mask);
8535 }
8536 }
8537 else if ((uvm_va_policy_is_read_duplicate(policy, va_space) ||
8538 (uvm_id_equal(policy->preferred_location, id) &&
8539 !is_uvm_fault_force_sysmem_set() &&
8540 !uvm_hmm_must_use_sysmem(block, va_block_context->hmm.vma))) &&
8541 uvm_va_space_processor_has_memory(va_space, id)) {
8542 // When operating under read-duplication we should only map the local
8543 // processor to cause fault-and-duplicate of remote pages.
8544 //
8545 // The same holds when this processor is the preferred location: only
8546 // create local mappings to force remote pages to fault-and-migrate.
8547 uvm_processor_mask_zero(allowed_mask);
8548 uvm_processor_mask_set(allowed_mask, id);
8549 }
8550 else {
8551 // Common case: Just map wherever the memory happens to reside
8552 uvm_processor_mask_and(allowed_mask, &block->resident, &va_space->can_access[uvm_id_value(id)]);
8553 return;
8554 }
8555
8556 // Clamp to resident and accessible processors
8557 uvm_processor_mask_and(allowed_mask, allowed_mask, &block->resident);
8558 uvm_processor_mask_and(allowed_mask, allowed_mask, &va_space->can_access[uvm_id_value(id)]);
8559 }
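// Hypothetical example: for a UVM-Lite GPU whose policy prefers the CPU with
// preferred_nid == 2, the code above yields allowed_mask == { CPU } and
// allowed_nid_mask == { node 2 }, so the GPU only maps pages resident on that
// specific NUMA node.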
8560
8561 NV_STATUS uvm_va_block_map(uvm_va_block_t *va_block,
8562 uvm_va_block_context_t *va_block_context,
8563 uvm_processor_id_t id,
8564 uvm_va_block_region_t region,
8565 const uvm_page_mask_t *map_page_mask,
8566 uvm_prot_t new_prot,
8567 UvmEventMapRemoteCause cause,
8568 uvm_tracker_t *out_tracker)
8569 {
8570 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8571 uvm_gpu_t *gpu = NULL;
8572 uvm_processor_mask_t *allowed_destinations;
8573 uvm_processor_id_t resident_id;
8574 const uvm_page_mask_t *pte_mask;
8575 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.map_running_page_mask;
8576 NV_STATUS status = NV_OK;
8577 const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
8578 nodemask_t *allowed_nid_destinations;
8579
8580 va_block_context->mapping.cause = cause;
8581
8582 UVM_ASSERT(new_prot != UVM_PROT_NONE);
8583 UVM_ASSERT(new_prot < UVM_PROT_MAX);
8584 uvm_assert_mutex_locked(&va_block->lock);
8585
8586 // Mapping is not supported on the eviction path that doesn't hold the VA
8587 // space lock.
8588 uvm_assert_rwsem_locked(&va_space->lock);
8589
8590 if (UVM_ID_IS_CPU(id)) {
8591 uvm_pte_bits_cpu_t prot_pte_bit;
8592
8593 // Check if the current thread is allowed to call vm_insert_page
8594 if (!uvm_va_block_is_hmm(va_block) && !uvm_va_range_vma_check(va_block->va_range, va_block_context->mm))
8595 return NV_OK;
8596
8597 prot_pte_bit = get_cpu_pte_bit_index(new_prot);
8598 pte_mask = &va_block->cpu.pte_bits[prot_pte_bit];
8599 }
8600 else {
8601 uvm_va_block_gpu_state_t *gpu_state;
8602 uvm_pte_bits_gpu_t prot_pte_bit;
8603
8604 gpu = uvm_va_space_get_gpu(va_space, id);
8605
8606 // Although this GPU UUID is registered in the VA space, it might not have a
8607 // GPU VA space registered.
8608 if (!uvm_gpu_va_space_get(va_space, gpu))
8609 return NV_OK;
8610
8611 gpu_state = block_gpu_state_get_alloc(va_block, gpu);
8612 if (!gpu_state)
8613 return NV_ERR_NO_MEMORY;
8614
8615 prot_pte_bit = get_gpu_pte_bit_index(new_prot);
8616 pte_mask = &gpu_state->pte_bits[prot_pte_bit];
8617 }
8618
8619 uvm_page_mask_init_from_region(running_page_mask, region, map_page_mask);
8620
8621 if (!uvm_page_mask_andnot(running_page_mask, running_page_mask, pte_mask))
8622 return NV_OK;
8623
8624 allowed_destinations = uvm_processor_mask_cache_alloc();
8625 if (!allowed_destinations)
8626 return NV_ERR_NO_MEMORY;
8627
8628 allowed_nid_destinations = uvm_kvmalloc(sizeof(*allowed_nid_destinations));
8629 if (!allowed_nid_destinations) {
8630 uvm_processor_mask_cache_free(allowed_destinations);
8631 return NV_ERR_NO_MEMORY;
8632 }
8633
8634 // Map per resident location so we can more easily detect physically-
8635 // contiguous mappings.
8636 map_get_allowed_destinations(va_block,
8637 va_block_context,
8638 policy,
8639 id,
8640 allowed_destinations,
8641 allowed_nid_destinations);
8642 for_each_closest_id(resident_id, allowed_destinations, id, va_space) {
8643 if (UVM_ID_IS_CPU(id)) {
8644 status = block_map_cpu_to(va_block,
8645 va_block_context,
8646 resident_id,
8647 region,
8648 running_page_mask,
8649 new_prot,
8650 out_tracker);
8651 }
8652 else if (UVM_ID_IS_CPU(resident_id)) {
8653 int nid;
8654
8655             // map_get_allowed_destinations() has set the mask of CPU NUMA
8656             // nodes that should be mapped.
8657 for_each_node_mask(nid, *allowed_nid_destinations) {
8658 status = block_map_gpu_to(va_block,
8659 va_block_context,
8660 gpu,
8661 resident_id,
8662 nid,
8663 running_page_mask,
8664 new_prot,
8665 out_tracker);
8666 if (status != NV_OK)
8667 break;
8668 }
8669 }
8670 else {
8671 status = block_map_gpu_to(va_block,
8672 va_block_context,
8673 gpu,
8674 resident_id,
8675 NUMA_NO_NODE,
8676 running_page_mask,
8677 new_prot,
8678 out_tracker);
8679 }
8680
8681 if (status != NV_OK)
8682 break;
8683
8684 // If we've mapped all requested pages, we're done
8685 if (uvm_page_mask_region_empty(running_page_mask, region))
8686 break;
8687 }
8688
8689 uvm_processor_mask_cache_free(allowed_destinations);
8690 uvm_kvfree(allowed_nid_destinations);
8691
8692 return status;
8693 }
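// Minimal usage sketch (assumed caller context, not taken from this file): a
// caller holding the va_block lock, the VA space lock and mmap_lock could
// establish read-only CPU mappings over a whole region with something like:
//
//     status = uvm_va_block_map(va_block,
//                               va_block_context,
//                               UVM_ID_CPU,
//                               region,
//                               NULL,        // NULL means all pages in region
//                               UVM_PROT_READ_ONLY,
//                               UvmEventMapRemoteCauseInvalid,
//                               &local_tracker);
//
// as block_revoke_cpu_write() below does (with an explicit page mask) when it
// remaps pages after revoking write permission.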
8694
8695 // Revokes CPU write permission on the given pages. This is implemented by
8696 // unmapping the pages and re-mapping them with the lower permission, which is
8697 // required because vm_insert_page can only be used for upgrades from Invalid.
8698 //
8699 // Caller must ensure that:
8700 // - Pages in revoke_page_mask must be set in the
8701 // cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE] mask.
8702 static NV_STATUS block_revoke_cpu_write(uvm_va_block_t *block,
8703 uvm_va_block_context_t *block_context,
8704 uvm_va_block_region_t region,
8705 const uvm_page_mask_t *revoke_page_mask,
8706 uvm_tracker_t *out_tracker)
8707 {
8708 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8709 uvm_va_block_region_t subregion;
8710
8711 UVM_ASSERT(revoke_page_mask);
8712
8713 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]));
8714
8715 block_unmap_cpu(block, block_context, region, revoke_page_mask);
8716
8717 // Coalesce revocation event notification
8718 for_each_va_block_subregion_in_mask(subregion, revoke_page_mask, region) {
8719 uvm_perf_event_notify_revocation(&va_space->perf_events,
8720 block,
8721 UVM_ID_CPU,
8722 uvm_va_block_region_start(block, subregion),
8723 uvm_va_block_region_size(subregion),
8724 UVM_PROT_READ_WRITE_ATOMIC,
8725 UVM_PROT_READ_ONLY);
8726 }
8727
8728 // uvm_va_block_map will skip this remap if we aren't holding the right mm
8729 // lock.
8730 return uvm_va_block_map(block,
8731 block_context,
8732 UVM_ID_CPU,
8733 region,
8734 revoke_page_mask,
8735 UVM_PROT_READ_ONLY,
8736 UvmEventMapRemoteCauseInvalid,
8737 out_tracker);
8738 }
8739
8740 static void block_revoke_prot_gpu_perf_notify(uvm_va_block_t *block,
8741 uvm_va_block_context_t *block_context,
8742 uvm_gpu_t *gpu,
8743 uvm_prot_t prot_revoked,
8744 const uvm_page_mask_t *pages_revoked)
8745 {
8746 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
8747 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, gpu->id);
8748 uvm_va_block_region_t subregion, region = uvm_va_block_region_from_block(block);
8749 uvm_pte_bits_gpu_t pte_bit;
8750
8751 for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= get_gpu_pte_bit_index(prot_revoked); pte_bit--) {
8752 uvm_prot_t old_prot;
8753
8754 if (!uvm_page_mask_and(&block_context->scratch_page_mask, &gpu_state->pte_bits[pte_bit], pages_revoked))
8755 continue;
8756
8757 if (pte_bit == UVM_PTE_BITS_GPU_ATOMIC)
8758 old_prot = UVM_PROT_READ_WRITE_ATOMIC;
8759 else
8760 old_prot = UVM_PROT_READ_WRITE;
8761
8762 for_each_va_block_subregion_in_mask(subregion, &block_context->scratch_page_mask, region) {
8763 uvm_perf_event_notify_revocation(&va_space->perf_events,
8764 block,
8765 gpu->id,
8766 uvm_va_block_region_start(block, subregion),
8767 uvm_va_block_region_size(subregion),
8768 old_prot,
8769 prot_revoked - 1);
8770 }
8771 }
8772 }
8773
8774 // Revokes the given protection from GPU mappings of pages resident on resident_id.
8775 // revoke_page_mask is an in/out parameter: the pages which have the appropriate
8776 // permissions and are mapped to resident_id are removed from the mask before
8777 // returning.
8778 //
8779 // Caller must ensure that:
8780 // - Pages in revoke_page_mask must be set in the corresponding pte_bits mask for
8781 // the protection to be revoked on the mapping GPU.
8782 static NV_STATUS block_revoke_prot_gpu_to(uvm_va_block_t *va_block,
8783 uvm_va_block_context_t *block_context,
8784 uvm_gpu_t *gpu,
8785 uvm_processor_id_t resident_id,
8786 uvm_page_mask_t *revoke_page_mask,
8787 uvm_prot_t prot_to_revoke,
8788 uvm_tracker_t *out_tracker)
8789 {
8790 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8791 uvm_push_t push;
8792 NV_STATUS status;
8793 uvm_pte_bits_gpu_t pte_bit;
8794 uvm_pte_bits_gpu_t prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
8795 uvm_prot_t new_prot = prot_to_revoke - 1;
8796 uvm_va_block_new_pte_state_t *new_pte_state = &block_context->mapping.new_pte_state;
8797 block_pte_op_t pte_op;
8798 const uvm_page_mask_t *resident_mask = uvm_va_block_resident_mask_get(va_block, resident_id, NUMA_NO_NODE);
8799 uvm_page_mask_t *pages_to_revoke = &block_context->mapping.page_mask;
8800
8801 UVM_ASSERT(revoke_page_mask);
8802 UVM_ASSERT(uvm_page_mask_subset(revoke_page_mask, &gpu_state->pte_bits[prot_pte_bit]));
8803
8804 // The pages which will actually change are those in the input page mask
8805 // which are resident on the target.
8806 if (!uvm_page_mask_and(pages_to_revoke, revoke_page_mask, resident_mask))
8807 return NV_OK;
8808
8809 UVM_ASSERT(block_check_mapping_residency(va_block, block_context, gpu, resident_id, pages_to_revoke));
8810
8811 // For PTE merge/split computation, compute all resident pages which will
8812 // have exactly prot_to_revoke-1 after performing the revocation.
8813 uvm_page_mask_andnot(&block_context->scratch_page_mask, &gpu_state->pte_bits[prot_pte_bit], pages_to_revoke);
8814 uvm_page_mask_andnot(&block_context->scratch_page_mask,
8815 &gpu_state->pte_bits[prot_pte_bit - 1],
8816 &block_context->scratch_page_mask);
8817 uvm_page_mask_and(&block_context->scratch_page_mask, &block_context->scratch_page_mask, resident_mask);
8818
8819 block_gpu_compute_new_pte_state(va_block,
8820 gpu,
8821 resident_id,
8822 pages_to_revoke,
8823 &block_context->scratch_page_mask,
8824 new_pte_state);
8825
8826 status = block_alloc_ptes_new_state(va_block, gpu, new_pte_state, out_tracker);
8827 if (status != NV_OK)
8828 return status;
8829
8830 status = uvm_push_begin_acquire(gpu->channel_manager,
8831 UVM_CHANNEL_TYPE_MEMOPS,
8832 &va_block->tracker,
8833 &push,
8834 "Revoking %s access privileges in block [0x%llx, 0x%llx) ",
8835 uvm_prot_string(prot_to_revoke),
8836 va_block->start,
8837 va_block->end + 1);
8838 if (status != NV_OK)
8839 return status;
8840
8841 pte_op = BLOCK_PTE_OP_REVOKE;
8842 if (new_pte_state->pte_is_2m) {
8843 // We're either modifying permissions of a pre-existing 2M PTE, or all
8844 // permissions match so we can merge to a new 2M PTE.
8845 block_gpu_map_to_2m(va_block, block_context, gpu, resident_id, new_prot, &push, pte_op);
8846 }
8847 else if (gpu_state->pte_is_2m) {
8848 // Permissions on a subset of the existing 2M PTE are being downgraded,
8849 // so we have to split it into the appropriate mix of big and 4k PTEs.
8850 block_gpu_map_split_2m(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
8851 }
8852 else {
8853 // We're downgrading permissions on some pre-existing mix of big and 4K
8854 // PTEs into some other mix of big and 4K PTEs.
8855 block_gpu_map_big_and_4k(va_block, block_context, gpu, resident_id, pages_to_revoke, new_prot, &push, pte_op);
8856 }
8857
8858 uvm_push_end(&push);
8859
8860 block_revoke_prot_gpu_perf_notify(va_block, block_context, gpu, prot_to_revoke, pages_to_revoke);
8861
8862 // Update GPU mapping state
8863 for (pte_bit = UVM_PTE_BITS_GPU_ATOMIC; pte_bit >= prot_pte_bit; pte_bit--)
8864 uvm_page_mask_andnot(&gpu_state->pte_bits[pte_bit], &gpu_state->pte_bits[pte_bit], pages_to_revoke);
8865
8866     // Remove all pages resident on this processor from the input mask: both
8867     // the pages which were revoked and the pages which already had the
8868     // correct permissions.
8869 uvm_page_mask_andnot(revoke_page_mask, revoke_page_mask, pages_to_revoke);
8870
8871 UVM_ASSERT(block_check_mappings(va_block, block_context));
8872
8873 return uvm_tracker_add_push_safe(out_tracker, &push);
8874 }
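// Note on the "prot_to_revoke - 1" arithmetic above: uvm_prot_t is ordered so
// that revoking a permission level leaves the next-lower one in place, e.g.
// revoking UVM_PROT_READ_WRITE_ATOMIC leaves UVM_PROT_READ_WRITE mappings and
// revoking UVM_PROT_READ_WRITE leaves UVM_PROT_READ_ONLY.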
8875
8876 NV_STATUS uvm_va_block_revoke_prot(uvm_va_block_t *va_block,
8877 uvm_va_block_context_t *va_block_context,
8878 uvm_processor_id_t id,
8879 uvm_va_block_region_t region,
8880 const uvm_page_mask_t *revoke_page_mask,
8881 uvm_prot_t prot_to_revoke,
8882 uvm_tracker_t *out_tracker)
8883 {
8884 uvm_gpu_t *gpu;
8885 uvm_va_block_gpu_state_t *gpu_state;
8886 uvm_processor_mask_t *resident_procs;
8887 uvm_processor_id_t resident_id;
8888 uvm_page_mask_t *running_page_mask = &va_block_context->mapping.revoke_running_page_mask;
8889 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
8890 uvm_pte_bits_gpu_t prot_pte_bit;
8891 NV_STATUS status = NV_OK;
8892
8893 UVM_ASSERT(prot_to_revoke > UVM_PROT_READ_ONLY);
8894 UVM_ASSERT(prot_to_revoke < UVM_PROT_MAX);
8895 uvm_assert_mutex_locked(&va_block->lock);
8896
8897 if (UVM_ID_IS_CPU(id)) {
8898 if (prot_to_revoke == UVM_PROT_READ_WRITE_ATOMIC)
8899 return NV_OK;
8900
8901 if (uvm_va_block_is_hmm(va_block)) {
8902 // Linux is responsible for CPU page table updates.
8903 uvm_page_mask_region_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], region);
8904 return NV_OK;
8905 }
8906
8907 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
8908
8909 if (uvm_page_mask_and(running_page_mask, running_page_mask, &va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]))
8910 return block_revoke_cpu_write(va_block, va_block_context, region, running_page_mask, out_tracker);
8911
8912 return NV_OK;
8913 }
8914
8915 gpu = uvm_va_space_get_gpu(va_space, id);
8916
8917 // UVM-Lite GPUs should never have access revoked
8918 UVM_ASSERT_MSG(!uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), gpu->id),
8919 "GPU %s\n", uvm_gpu_name(gpu));
8920
8921 // Return early if there are no mappings for the GPU present in the block
8922 if (!uvm_processor_mask_test(&va_block->mapped, gpu->id))
8923 return NV_OK;
8924
8925 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
8926 prot_pte_bit = get_gpu_pte_bit_index(prot_to_revoke);
8927
8928 uvm_page_mask_init_from_region(running_page_mask, region, revoke_page_mask);
8929
8930 if (!uvm_page_mask_and(running_page_mask, running_page_mask, &gpu_state->pte_bits[prot_pte_bit]))
8931 return NV_OK;
8932
8933 resident_procs = uvm_processor_mask_cache_alloc();
8934 if (!resident_procs)
8935 return NV_ERR_NO_MEMORY;
8936
8937 // Revoke per resident location so we can more easily detect physically-
8938 // contiguous mappings.
8939 uvm_processor_mask_copy(resident_procs, &va_block->resident);
8940
8941 for_each_closest_id(resident_id, resident_procs, gpu->id, va_space) {
8942 NV_STATUS status = block_revoke_prot_gpu_to(va_block,
8943 va_block_context,
8944 gpu,
8945 resident_id,
8946 running_page_mask,
8947 prot_to_revoke,
8948 out_tracker);
8949 if (status != NV_OK)
8950 break;
8951
8952 // If we've revoked all requested pages, we're done
8953 if (uvm_page_mask_region_empty(running_page_mask, region))
8954 break;
8955 }
8956
8957 uvm_processor_mask_cache_free(resident_procs);
8958
8959 return status;
8960 }
8961
8962 NV_STATUS uvm_va_block_map_mask(uvm_va_block_t *va_block,
8963 uvm_va_block_context_t *va_block_context,
8964 const uvm_processor_mask_t *map_processor_mask,
8965 uvm_va_block_region_t region,
8966 const uvm_page_mask_t *map_page_mask,
8967 uvm_prot_t new_prot,
8968 UvmEventMapRemoteCause cause)
8969 {
8970 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
8971 NV_STATUS status = NV_OK;
8972 NV_STATUS tracker_status;
8973 uvm_processor_id_t id;
8974
8975 for_each_id_in_mask(id, map_processor_mask) {
8976 status = uvm_va_block_map(va_block,
8977 va_block_context,
8978 id,
8979 region,
8980 map_page_mask,
8981 new_prot,
8982 cause,
8983 &local_tracker);
8984 if (status != NV_OK)
8985 break;
8986 }
8987
8988 // Regardless of error, add the successfully-pushed mapping operations into
8989 // the block's tracker. Note that we can't overwrite the tracker because we
8990 // aren't guaranteed that the map actually pushed anything (in which case it
8991 // would've acquired the block tracker first).
8992 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
8993 uvm_tracker_deinit(&local_tracker);
8994
8995 return status == NV_OK ? tracker_status : status;
8996 }
8997
8998 NV_STATUS uvm_va_block_unmap_mask(uvm_va_block_t *va_block,
8999 uvm_va_block_context_t *va_block_context,
9000 const uvm_processor_mask_t *unmap_processor_mask,
9001 uvm_va_block_region_t region,
9002 const uvm_page_mask_t *unmap_page_mask)
9003 {
9004 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9005 NV_STATUS status = NV_OK;
9006 NV_STATUS tracker_status;
9007 uvm_processor_id_t id;
9008
9009     // Watch out: unmap_processor_mask could change during iteration since it
9010     // could point to &va_block->mapped.
9011 for_each_id_in_mask(id, unmap_processor_mask) {
9012 // Errors could either be a system-fatal error (ECC) or an allocation
9013 // retry due to PTE splitting. In either case we should stop after
9014 // hitting the first one.
9015 status = uvm_va_block_unmap(va_block, va_block_context, id, region, unmap_page_mask, &local_tracker);
9016 if (status != NV_OK)
9017 break;
9018 }
9019
9020 // See the comment in uvm_va_block_map_mask for adding to the tracker.
9021 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9022 uvm_tracker_deinit(&local_tracker);
9023
9024 return status == NV_OK ? tracker_status : status;
9025 }
9026
9027 NV_STATUS uvm_va_block_revoke_prot_mask(uvm_va_block_t *va_block,
9028 uvm_va_block_context_t *va_block_context,
9029 const uvm_processor_mask_t *revoke_processor_mask,
9030 uvm_va_block_region_t region,
9031 const uvm_page_mask_t *revoke_page_mask,
9032 uvm_prot_t prot_to_revoke)
9033 {
9034 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9035 NV_STATUS status = NV_OK;
9036 NV_STATUS tracker_status;
9037 uvm_processor_id_t id;
9038
9039 for_each_id_in_mask(id, revoke_processor_mask) {
9040 status = uvm_va_block_revoke_prot(va_block,
9041 va_block_context,
9042 id,
9043 region,
9044 revoke_page_mask,
9045 prot_to_revoke,
9046 &local_tracker);
9047 if (status != NV_OK)
9048 break;
9049 }
9050
9051 // See the comment in uvm_va_block_map_mask for adding to the tracker.
9052 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
9053 uvm_tracker_deinit(&local_tracker);
9054
9055 return status == NV_OK ? tracker_status : status;
9056 }
9057
9058 // Updates the read_duplicated_pages mask in the block when the state of GPU id
9059 // is being destroyed
9060 static void update_read_duplicated_pages_mask(uvm_va_block_t *block,
9061 uvm_gpu_id_t id,
9062 uvm_va_block_gpu_state_t *gpu_state)
9063 {
9064 uvm_gpu_id_t running_id;
9065 bool first = true;
9066 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9067 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9068 uvm_page_mask_t *running_page_mask = &block_context->update_read_duplicated_pages.running_page_mask;
9069 uvm_page_mask_t *tmp_page_mask = &block_context->scratch_page_mask;
9070
9071 uvm_page_mask_zero(&block->read_duplicated_pages);
9072
9073 for_each_id_in_mask(running_id, &block->resident) {
9074 const uvm_page_mask_t *running_residency_mask;
9075
9076 if (uvm_id_equal(running_id, id))
9077 continue;
9078
9079 running_residency_mask = uvm_va_block_resident_mask_get(block, running_id, NUMA_NO_NODE);
9080
9081 if (first) {
9082 uvm_page_mask_copy(running_page_mask, running_residency_mask);
9083 first = false;
9084 continue;
9085 }
9086
9087 if (uvm_page_mask_and(tmp_page_mask, running_page_mask, running_residency_mask))
9088 uvm_page_mask_or(&block->read_duplicated_pages, &block->read_duplicated_pages, tmp_page_mask);
9089
9090 uvm_page_mask_or(running_page_mask, running_page_mask, running_residency_mask);
9091 }
9092 }
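// Worked example: suppose pages 0..15 are resident on the CPU, pages 8..31 on
// GPU A, and GPU B (the processor whose state is being destroyed) also holds
// copies of pages 8..15. GPU B is skipped, the loop intersects the remaining
// residency masks, and read_duplicated_pages ends up as 8..15: exactly the
// pages still resident on two or more processors.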
9093
9094 // Unmaps all GPU mappings under this block, frees the page tables, and frees
9095 // all the GPU chunks. This simply drops the chunks on the floor, so the caller
9096 // must take care of copying the data elsewhere if it needs to remain intact.
9097 //
9098 // This serializes on the block tracker since it must unmap page tables.
9099 static void block_destroy_gpu_state(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_id_t id)
9100 {
9101 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
9102 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9103 uvm_gpu_va_space_t *gpu_va_space;
9104 uvm_gpu_t *gpu, *other_gpu;
9105
9106 if (!gpu_state)
9107 return;
9108
9109 uvm_assert_mutex_locked(&block->lock);
9110
9111 // Unmap PTEs and free page tables
9112 gpu = uvm_va_space_get_gpu(va_space, id);
9113 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
9114 if (gpu_va_space) {
9115
9116 uvm_va_block_remove_gpu_va_space(block, gpu_va_space, block_context);
9117 }
9118
9119 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, id));
9120
9121 // No processor should have this GPU mapped at this point
9122 UVM_ASSERT(block_check_processor_not_mapped(block, block_context, id));
9123
9124 // We need to remove the mappings of the indirect peers from the reverse
9125 // map when the GPU state is being destroyed (for example, on
9126 // unregister_gpu) and when peer access between indirect peers is disabled.
9127 // However, we need to avoid double mapping removals. There are two
9128 // possible scenarios:
9129 // - Disable peer access first. This will remove all mappings between A and
9130 // B GPUs, and the indirect_peers bit is cleared. Thus, the later call to
9131 // unregister_gpu will not operate on that pair of GPUs.
9132 // - Unregister GPU first. This will remove all mappings from all indirect
9133 // peers to the GPU being unregistered. It will also destroy its GPU state.
9134 // Subsequent calls to disable peers will remove the mappings from the GPU
9135 // being unregistered, but never to the GPU being unregistered (since it no
9136 // longer has a valid GPU state).
9137 for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)])
9138 block_gpu_unmap_all_chunks_indirect_peer(block, gpu, other_gpu);
9139
9140 if (gpu_state->chunks) {
9141 size_t i, num_chunks;
9142
9143 update_read_duplicated_pages_mask(block, id, gpu_state);
9144 uvm_page_mask_zero(&gpu_state->resident);
9145 block_clear_resident_processor(block, id);
9146
9147 num_chunks = block_num_gpu_chunks(block, gpu);
9148 for (i = 0; i < num_chunks; i++) {
9149 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
9150 if (!chunk)
9151 continue;
9152
9153 uvm_mmu_chunk_unmap(chunk, &block->tracker);
9154 uvm_pmm_gpu_free(&gpu->pmm, chunk, &block->tracker);
9155 }
9156
9157 uvm_kvfree(gpu_state->chunks);
9158 }
9159 else {
9160 UVM_ASSERT(!uvm_processor_mask_test(&block->resident, id));
9161 }
9162
9163
9164 // Pending operations may still need the DMA memory to be mapped.
9165 uvm_tracker_wait(&block->tracker);
9166
9167 block_gpu_unmap_phys_all_cpu_pages(block, gpu);
9168 uvm_processor_mask_clear(&block->evicted_gpus, id);
9169
9170 kmem_cache_free(g_uvm_va_block_gpu_state_cache, gpu_state);
9171 block->gpus[uvm_id_gpu_index(id)] = NULL;
9172 }
9173
9174 static void block_put_ptes_safe(uvm_page_tree_t *tree, uvm_page_table_range_t *range)
9175 {
9176 if (range->table) {
9177 uvm_page_tree_put_ptes(tree, range);
9178 memset(range, 0, sizeof(*range));
9179 }
9180 }
9181
9182 NV_STATUS uvm_va_block_add_gpu_va_space(uvm_va_block_t *va_block, uvm_gpu_va_space_t *gpu_va_space)
9183 {
9184 uvm_assert_mutex_locked(&va_block->lock);
9185
9186 if (!gpu_va_space->ats.enabled || !va_block->cpu.ever_mapped)
9187 return NV_OK;
9188
9189 // Pre-populate PDEs down to PDE1 for all GPU VA spaces on ATS systems. See
9190     // comments in block_pre_populate_pde1_gpu.
9191 return block_pre_populate_pde1_gpu(va_block, gpu_va_space, NULL);
9192 }
9193
9194 void uvm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
9195 uvm_gpu_va_space_t *gpu_va_space,
9196 uvm_va_block_context_t *block_context)
9197 {
9198 uvm_pte_batch_t *pte_batch = &block_context->mapping.pte_batch;
9199 uvm_tlb_batch_t *tlb_batch = &block_context->mapping.tlb_batch;
9200 uvm_gpu_t *gpu = gpu_va_space->gpu;
9201 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
9202 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9203 uvm_push_t push;
9204 NV_STATUS status;
9205
9206 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
9207
9208 if (!gpu_state)
9209 return;
9210
9211 uvm_assert_mutex_locked(&va_block->lock);
9212
9213 // Unmapping the whole block won't cause a page table split, so this should
9214 // only fail if we have a system-fatal error.
9215 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &local_tracker);
9216 if (status != NV_OK) {
9217 UVM_ASSERT(status == uvm_global_get_status());
9218 return; // Just leak
9219 }
9220
9221 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
9222
9223 // Reset the page tables if other allocations could reuse them
9224 if (!block_gpu_supports_2m(va_block, gpu) &&
9225 !bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK)) {
9226
9227 status = uvm_push_begin_acquire(gpu->channel_manager,
9228 UVM_CHANNEL_TYPE_MEMOPS,
9229 &local_tracker,
9230 &push,
9231 "Resetting PTEs for block [0x%llx, 0x%llx)",
9232 va_block->start,
9233 va_block->end + 1);
9234 if (status != NV_OK) {
9235 UVM_ASSERT(status == uvm_global_get_status());
9236 return; // Just leak
9237 }
9238
9239 uvm_pte_batch_begin(&push, pte_batch);
9240 uvm_tlb_batch_begin(&gpu_va_space->page_tables, tlb_batch);
9241
9242         // When a big PTE is active, the 4k PTEs under it are garbage. Make
9243 // them invalid so the page tree code can reuse them for other
9244 // allocations on this VA. These don't need TLB invalidates since the
9245 // big PTEs above them are active.
9246 if (gpu_state->page_table_range_4k.table) {
9247 uvm_page_mask_init_from_big_ptes(va_block, gpu, &block_context->scratch_page_mask, gpu_state->big_ptes);
9248 block_gpu_pte_clear_4k(va_block, gpu, &block_context->scratch_page_mask, 0, pte_batch, NULL);
9249 }
9250
9251 // We unmapped all big PTEs above, which means they have the unmapped
9252 // pattern so the GPU MMU won't read 4k PTEs under them. Set them to
9253 // invalid to activate the 4ks below so new allocations using just those
9254 // 4k PTEs will work.
9255 block_gpu_pte_clear_big(va_block, gpu, gpu_state->big_ptes, 0, pte_batch, tlb_batch);
9256
9257 uvm_pte_batch_end(pte_batch);
9258 uvm_tlb_batch_end(tlb_batch, &push, UVM_MEMBAR_NONE);
9259
9260 uvm_push_end(&push);
9261 uvm_tracker_overwrite_with_push(&local_tracker, &push);
9262 }
9263
9264 // The unmap must finish before we free the page tables
9265 status = uvm_tracker_wait_deinit(&local_tracker);
9266 if (status != NV_OK)
9267 return; // System-fatal error, just leak
9268
9269 // Note that if the PTE is currently 2M with lower tables allocated but not
9270 // in use, calling put_ptes on those lower ranges will re-write the 2M entry
9271 // to be a PDE.
9272 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_4k);
9273 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_big);
9274 block_put_ptes_safe(&gpu_va_space->page_tables, &gpu_state->page_table_range_2m);
9275
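    // The page table ranges were released above, so reset the cached PTE
    // tracking state for this GPU back to its initial (nothing allocated)
    // values.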
9276 gpu_state->pte_is_2m = false;
9277 gpu_state->initialized_big = false;
9278 gpu_state->activated_big = false;
9279 gpu_state->activated_4k = false;
9280 bitmap_zero(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9281
9282 UVM_ASSERT(block_check_mappings(va_block, block_context));
9283 }
9284
NV_STATUS uvm_va_block_enable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
9286 {
9287 NV_STATUS status;
9288 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9289
9290 UVM_ASSERT(uvm_gpu_peer_caps(gpu0, gpu1)->link_type != UVM_GPU_LINK_INVALID);
9291 uvm_assert_rwsem_locked_write(&va_space->lock);
9292 uvm_assert_mutex_locked(&va_block->lock);
9293
9294 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
9295 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9296 if (status != NV_OK)
9297 return status;
9298
9299 status = block_gpu_map_all_chunks_indirect_peer(va_block, gpu1, gpu0);
9300 if (status != NV_OK) {
9301 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9302 return status;
9303 }
9304 }
9305
9306 // TODO: Bug 1767224: Refactor the uvm_va_block_set_accessed_by logic so we
9307 // call it here.
9308
9309 return NV_OK;
9310 }
9311
void uvm_va_block_disable_peer(uvm_va_block_t *va_block, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
9313 {
9314 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9315 NV_STATUS status;
9316 uvm_tracker_t tracker = UVM_TRACKER_INIT();
9317 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9318 uvm_page_mask_t *unmap_page_mask = &block_context->caller_page_mask;
9319 const uvm_page_mask_t *resident0;
9320 const uvm_page_mask_t *resident1;
9321
9322 uvm_assert_mutex_locked(&va_block->lock);
9323
9324 // See comment in block_destroy_gpu_state
9325 if (uvm_processor_mask_test(&va_space->indirect_peers[uvm_id_value(gpu0->id)], gpu1->id)) {
9326 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu0, gpu1);
9327 block_gpu_unmap_all_chunks_indirect_peer(va_block, gpu1, gpu0);
9328 }
9329
9330 // If either of the GPUs doesn't have GPU state then nothing could be mapped
9331 // between them.
9332 if (!uvm_va_block_gpu_state_get(va_block, gpu0->id) || !uvm_va_block_gpu_state_get(va_block, gpu1->id))
9333 return;
9334
9335 resident0 = uvm_va_block_resident_mask_get(va_block, gpu0->id, NUMA_NO_NODE);
9336 resident1 = uvm_va_block_resident_mask_get(va_block, gpu1->id, NUMA_NO_NODE);
9337
9338 // Unmap all pages resident on gpu1, but not on gpu0, from gpu0
9339 if (uvm_page_mask_andnot(unmap_page_mask, resident1, resident0)) {
9340 status = block_unmap_gpu(va_block, block_context, gpu0, unmap_page_mask, &tracker);
9341 if (status != NV_OK) {
9342 // Since all PTEs unmapped by this call have the same aperture, page
9343 // splits should never be required so any failure should be the
9344 // result of a system-fatal error.
9345 UVM_ASSERT_MSG(status == uvm_global_get_status(),
9346 "Unmapping failed: %s, GPU %s\n",
9347 nvstatusToString(status),
9348 uvm_gpu_name(gpu0));
9349 }
9350 }
9351
9352 // Unmap all pages resident on gpu0, but not on gpu1, from gpu1
9353 if (uvm_page_mask_andnot(unmap_page_mask, resident0, resident1)) {
9354 status = block_unmap_gpu(va_block, block_context, gpu1, unmap_page_mask, &tracker);
9355 if (status != NV_OK) {
9356 UVM_ASSERT_MSG(status == uvm_global_get_status(),
9357 "Unmapping failed: %s, GPU %s\n",
9358 nvstatusToString(status),
9359 uvm_gpu_name(gpu0));
9360 }
9361 }
9362
9363 status = uvm_tracker_add_tracker_safe(&va_block->tracker, &tracker);
9364 if (status != NV_OK)
9365 UVM_ASSERT(status == uvm_global_get_status());
9366
9367 status = uvm_tracker_wait_deinit(&tracker);
9368 if (status != NV_OK)
9369 UVM_ASSERT(status == uvm_global_get_status());
9370 }
9371
void uvm_va_block_unmap_preferred_location_uvm_lite(uvm_va_block_t *va_block, uvm_gpu_t *gpu)
9373 {
9374 NV_STATUS status;
9375 uvm_va_range_t *va_range = va_block->va_range;
9376 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9377 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9378 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9379
9380 uvm_assert_mutex_locked(&va_block->lock);
9381 UVM_ASSERT(uvm_processor_mask_test(&va_range->uvm_lite_gpus, gpu->id));
9382
9383 // If the GPU doesn't have GPU state then nothing could be mapped.
9384 if (!uvm_va_block_gpu_state_get(va_block, gpu->id))
9385 return;
9386
9387 // In UVM-Lite mode, mappings to the preferred location are not tracked
9388 // directly, so just unmap the whole block.
9389 status = uvm_va_block_unmap(va_block, block_context, gpu->id, region, NULL, &va_block->tracker);
9390 if (status != NV_OK) {
9391 // Unmapping the whole block should not cause page splits so any failure
9392 // should be the result of a system-fatal error.
9393 UVM_ASSERT_MSG(status == uvm_global_get_status(),
9394 "Unmapping failed: %s, GPU %s\n",
9395 nvstatusToString(status), uvm_gpu_name(gpu));
9396 }
9397
9398 status = uvm_tracker_wait(&va_block->tracker);
9399 if (status != NV_OK) {
9400 UVM_ASSERT_MSG(status == uvm_global_get_status(),
9401 "Unmapping failed: %s, GPU %s\n",
9402 nvstatusToString(status), uvm_gpu_name(gpu));
9403 }
9404 }
9405
// Evict pages from the GPU by moving each resident region to the CPU.
//
// Note that the caller needs to support allocation-retry, since
// uvm_va_block_migrate_locked() requires it.
static NV_STATUS block_evict_pages_from_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9411 {
9412 NV_STATUS status = NV_OK;
9413 const uvm_page_mask_t *resident = uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE);
9414 uvm_va_block_region_t region = uvm_va_block_region_from_block(va_block);
9415 uvm_va_block_region_t subregion;
9416 uvm_service_block_context_t *service_context;
9417
9418 service_context = uvm_service_block_context_alloc(mm);
9419 if (!service_context)
9420 return NV_ERR_NO_MEMORY;
9421
9422 // Move all subregions resident on the GPU to the CPU
9423 for_each_va_block_subregion_in_mask(subregion, resident, region) {
9424 if (uvm_va_block_is_hmm(va_block)) {
9425 status = uvm_hmm_va_block_evict_pages_from_gpu(va_block, gpu, service_context, resident, subregion);
9426 }
9427 else {
9428 status = uvm_va_block_migrate_locked(va_block,
9429 NULL,
9430 service_context,
9431 subregion,
9432 UVM_ID_CPU,
9433 UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
9434 NULL);
9435 }
9436
9437 if (status != NV_OK)
9438 break;
9439 }
9440
9441 if (status == NV_OK)
9442 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu->id));
9443
9444 uvm_service_block_context_free(service_context);
9445
9446 return status;
9447 }
9448
void uvm_va_block_unregister_gpu_locked(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9450 {
9451 NV_STATUS status;
9452 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
9453 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9454 uvm_va_block_context_t *va_block_context = uvm_va_space_block_context(va_space, mm);
9455
9456 uvm_assert_mutex_locked(&va_block->lock);
9457
9458 if (!gpu_state)
9459 return;
9460
9461 // The mappings should've already been torn down by GPU VA space unregister
9462 UVM_ASSERT(!uvm_processor_mask_test(&va_block->mapped, gpu->id));
9463 UVM_ASSERT(uvm_page_mask_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ]));
9464 UVM_ASSERT(!block_gpu_has_page_tables(va_block, gpu));
9465
9466 // Use UVM_VA_BLOCK_RETRY_LOCKED() as the va block lock is already taken and
9467 // we don't rely on any state of the block across the call.
9468 // TODO: Bug 4494289: Prevent setting the global error on allocation
9469 // failures.
9470 status = UVM_VA_BLOCK_RETRY_LOCKED(va_block, NULL, block_evict_pages_from_gpu(va_block, gpu, mm));
9471 if (status != NV_OK) {
9472 UVM_ERR_PRINT("Failed to evict GPU pages on GPU unregister: %s, GPU %s\n",
9473 nvstatusToString(status),
9474 uvm_gpu_name(gpu));
9475 uvm_global_set_fatal_error(status);
9476 }
9477
9478 // This function will copy the block's tracker into each chunk then free the
9479 // chunk to PMM. If we do this before waiting for the block tracker below
9480 // we'll populate PMM's free chunks with tracker entries, which gives us
9481 // better testing coverage of chunk synchronization on GPU unregister.
9482 block_destroy_gpu_state(va_block, va_block_context, gpu->id);
9483
9484 // Any time a GPU is unregistered we need to make sure that there are no
9485 // pending (direct or indirect) tracker entries for that GPU left in the
9486 // block's tracker. The only way to ensure that is to wait for the whole
9487 // tracker.
9488 status = uvm_tracker_wait(&va_block->tracker);
9489 if (status != NV_OK)
9490 UVM_ASSERT(status == uvm_global_get_status());
9491 }
9492
void uvm_va_block_unregister_gpu(uvm_va_block_t *va_block, uvm_gpu_t *gpu, struct mm_struct *mm)
9494 {
9495 // Take the lock internally to not expose the caller to allocation-retry.
9496 uvm_mutex_lock(&va_block->lock);
9497
9498 uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
9499
9500 uvm_mutex_unlock(&va_block->lock);
9501 }
9502
static void block_mark_region_cpu_dirty(uvm_va_block_t *va_block, uvm_va_block_region_t region)
9504 {
9505 uvm_page_index_t page_index;
9506 uvm_page_mask_t *resident_mask;
9507
9508 uvm_assert_mutex_locked(&va_block->lock);
9509 resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
9510 for_each_va_block_page_in_region_mask(page_index, resident_mask, region) {
9511 int nid = block_get_page_node_residency(va_block, page_index);
9512 UVM_ASSERT(nid != NUMA_NO_NODE);
9513 block_mark_cpu_page_dirty(va_block, page_index, nid);
9514 }
9515 }
9516
// Tears down everything within the block, but doesn't free the block itself.
// Note that when uvm_va_block_kill is called, this is called twice: once for
// the initial kill itself, then again when the block's ref count eventually
// drops to zero and the block is destroyed. block->va_range is used to track
// whether the block has already been killed.
static void block_kill(uvm_va_block_t *block)
9523 {
9524 uvm_va_space_t *va_space;
9525 uvm_perf_event_data_t event_data;
9526 uvm_cpu_chunk_t *chunk;
9527 uvm_gpu_id_t id;
9528 NV_STATUS status;
9529 uvm_va_block_region_t region = uvm_va_block_region_from_block(block);
9530 uvm_page_index_t page_index;
9531 uvm_page_index_t next_page_index;
9532 int nid;
9533 uvm_va_block_context_t *block_context;
9534
9535 if (uvm_va_block_is_dead(block))
9536 return;
9537
9538 va_space = uvm_va_block_get_va_space(block);
9539 event_data.block_destroy.block = block;
9540 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_DESTROY, &event_data);
9541
9542 block_context = uvm_va_space_block_context(va_space, NULL);
9543
9544 // Unmap all processors in parallel first. Unmapping the whole block won't
9545 // cause a page table split, so this should only fail if we have a system-
9546 // fatal error.
9547 if (!uvm_processor_mask_empty(&block->mapped)) {
9548 // HMM CPU mappings are controlled by Linux so no need to unmap.
9549 // Remote GPU mappings will be removed below.
9550 if (uvm_va_block_is_hmm(block) && uvm_processor_mask_test(&block->mapped, UVM_ID_CPU)) {
9551 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE]);
9552 uvm_page_mask_zero(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]);
9553 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
9554 }
9555
9556 // We could only be killed with mapped GPU state by VA range free or VA
9557 // space teardown, so it's safe to use the va_space's block_context
9558 // because both of those have the VA space lock held in write mode.
9559 status = uvm_va_block_unmap_mask(block, block_context, &block->mapped, region, NULL);
9560 UVM_ASSERT(status == uvm_global_get_status());
9561 }
9562
9563 UVM_ASSERT(uvm_processor_mask_empty(&block->mapped));
9564
9565 // Free the GPU page tables and chunks
9566 for_each_gpu_id(id)
9567 block_destroy_gpu_state(block, block_context, id);
9568
9569 // Wait for the GPU PTE unmaps before freeing CPU memory
9570 uvm_tracker_wait_deinit(&block->tracker);
9571
9572 // No processor should have the CPU mapped at this point
9573 UVM_ASSERT(block_check_processor_not_mapped(block, block_context, UVM_ID_CPU));
9574
9575 // Free CPU pages
9576 for_each_possible_uvm_node(nid) {
9577 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9578 size_t index = node_to_index(nid);
9579
9580 for_each_cpu_chunk_in_block_safe(chunk, page_index, next_page_index, block, nid) {
            // To be conservative, tell the OS we wrote to the page because we
            // sometimes clear the dirty bit after writing to it. HMM dirty
            // flags are managed by the kernel.
9584 if (!uvm_va_block_is_hmm(block))
9585 uvm_cpu_chunk_mark_dirty(chunk, 0);
9586
9587 uvm_cpu_chunk_remove_from_block(block, nid, page_index);
9588 uvm_cpu_chunk_free(chunk);
9589 }
9590
9591 UVM_ASSERT(uvm_page_mask_empty(&node_state->allocated));
9592 UVM_ASSERT(node_state->chunks == 0);
9593 kmem_cache_free(g_uvm_va_block_cpu_node_state_cache, block->cpu.node_state[index]);
9594 }
9595
9596 uvm_kvfree((void *)block->cpu.node_state);
9597 block->cpu.node_state = NULL;
9598
9599 // Clearing the resident bit isn't strictly necessary since this block
9600 // is getting destroyed, but it keeps state consistent for assertions.
9601 uvm_page_mask_zero(&block->cpu.resident);
9602 block_clear_resident_processor(block, UVM_ID_CPU);
9603
9604 if (uvm_va_block_is_hmm(block))
9605 uvm_va_policy_clear(block, block->start, block->end);
9606
9607 block->va_range = NULL;
9608 #if UVM_IS_CONFIG_HMM()
9609 block->hmm.va_space = NULL;
9610 #endif
9611 }
9612
9613 // Called when the block's ref count drops to 0
void uvm_va_block_destroy(nv_kref_t *nv_kref)
9615 {
9616 uvm_va_block_t *block = container_of(nv_kref, uvm_va_block_t, kref);
9617
9618 // Nobody else should have a reference when freeing
9619 uvm_assert_mutex_unlocked(&block->lock);
9620
9621 uvm_mutex_lock(&block->lock);
9622 block_kill(block);
9623 uvm_mutex_unlock(&block->lock);
9624 uvm_va_block_free(block);
9625 }
9626
void uvm_va_block_kill(uvm_va_block_t *va_block)
9628 {
9629 uvm_mutex_lock(&va_block->lock);
9630 block_kill(va_block);
9631 uvm_mutex_unlock(&va_block->lock);
9632
9633 // May call block_kill again
9634 uvm_va_block_release(va_block);
9635 }
9636
static void block_gpu_release_region(uvm_va_block_t *va_block,
9638 uvm_gpu_id_t gpu_id,
9639 uvm_va_block_gpu_state_t *gpu_state,
9640 uvm_page_mask_t *page_mask,
9641 uvm_va_block_region_t region)
9642 {
9643 uvm_page_index_t page_index;
9644
9645 for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
9646 uvm_gpu_chunk_t *gpu_chunk = gpu_state->chunks[page_index];
9647
9648 if (!gpu_chunk)
9649 continue;
9650
9651 // TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
9652
9653 uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
9654
9655 // The GPU chunk will be freed when the device private reference drops.
9656 if (uvm_page_mask_test_and_clear(&gpu_state->resident, page_index) &&
9657 uvm_page_mask_empty(&gpu_state->resident))
9658 block_clear_resident_processor(va_block, gpu_id);
9659
9660 gpu_state->chunks[page_index] = NULL;
9661 }
9662 }
9663
void uvm_va_block_munmap_region(uvm_va_block_t *va_block,
9665 uvm_va_block_region_t region)
9666 {
9667 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
9668 uvm_perf_event_data_t event_data;
9669 uvm_gpu_id_t gpu_id;
9670
9671 UVM_ASSERT(uvm_va_block_is_hmm(va_block));
9672 uvm_assert_mutex_locked(&va_block->lock);
9673
9674 // Reset thrashing state for the region.
9675 event_data.block_munmap.block = va_block;
9676 event_data.block_munmap.region = region;
9677 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_MUNMAP, &event_data);
9678
9679 // Release any remaining vidmem chunks in the given region.
9680 for_each_gpu_id(gpu_id) {
9681 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
9682
9683 if (!gpu_state)
9684 continue;
9685
9686 uvm_page_mask_region_clear(&gpu_state->evicted, region);
9687 if (uvm_page_mask_empty(&gpu_state->evicted))
9688 uvm_processor_mask_clear(&va_block->evicted_gpus, gpu_id);
9689
9690 if (gpu_state->chunks) {
9691 block_gpu_release_region(va_block, gpu_id, gpu_state, NULL, region);
9692
9693 // TODO: bug 3660922: Need to update the read duplicated pages mask
9694 // when read duplication is supported for HMM.
9695 }
9696 else {
9697 UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, gpu_id));
9698 }
9699 }
9700
9701 uvm_va_policy_clear(va_block,
9702 uvm_va_block_region_start(va_block, region),
9703 uvm_va_block_region_end(va_block, region));
9704 }
9705
static NV_STATUS block_split_presplit_ptes_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9707 {
9708 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9709 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9710 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
9711 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
9712 NvU32 alloc_sizes;
9713 DECLARE_BITMAP(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9714 uvm_page_index_t new_start_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9715 size_t big_page_index;
9716 uvm_push_t push;
9717 NV_STATUS status;
9718
9719 // We only have to split to big PTEs if we're currently a 2M PTE
9720 if (existing_gpu_state->pte_is_2m) {
9721 // We can skip the split if the 2M PTE is invalid and we have no lower
9722 // PTEs.
9723 if (block_page_prot_gpu(existing, gpu, 0) == UVM_PROT_NONE &&
9724 !existing_gpu_state->page_table_range_big.table &&
9725 !existing_gpu_state->page_table_range_4k.table)
9726 return NV_OK;
9727
9728 alloc_sizes = big_page_size;
9729 bitmap_fill(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9730
9731 if (!IS_ALIGNED(new->start, big_page_size)) {
9732 alloc_sizes |= UVM_PAGE_SIZE_4K;
9733
9734 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
9735 __clear_bit(big_page_index, new_big_ptes);
9736 }
9737
9738 status = block_alloc_ptes_with_retry(existing, gpu, alloc_sizes, NULL);
9739 if (status != NV_OK)
9740 return status;
9741
9742 status = uvm_push_begin_acquire(gpu->channel_manager,
9743 UVM_CHANNEL_TYPE_MEMOPS,
9744 &existing->tracker,
9745 &push,
9746 "Splitting 2M PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
9747 existing->start, existing->end + 1,
9748 new->start, new->end + 1);
9749 if (status != NV_OK)
9750 return status;
9751
9752 block_gpu_split_2m(existing, block_context, gpu, new_big_ptes, &push);
9753 }
9754 else {
9755 big_page_index = uvm_va_block_big_page_index(existing, new_start_page_index, big_page_size);
9756
9757 // If the split point is on a big page boundary, or if the split point
9758 // is not currently covered by a big PTE, we don't have to split
9759 // anything.
9760 if (IS_ALIGNED(new->start, big_page_size) ||
9761 big_page_index == MAX_BIG_PAGES_PER_UVM_VA_BLOCK ||
9762 !test_bit(big_page_index, existing_gpu_state->big_ptes))
9763 return NV_OK;
9764
9765 status = block_alloc_ptes_with_retry(existing, gpu, UVM_PAGE_SIZE_4K, NULL);
9766 if (status != NV_OK)
9767 return status;
9768
9769 bitmap_zero(new_big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
9770 __set_bit(big_page_index, new_big_ptes);
9771
9772 status = uvm_push_begin_acquire(gpu->channel_manager,
9773 UVM_CHANNEL_TYPE_MEMOPS,
9774 &existing->tracker,
9775 &push,
9776 "Splitting big PTE, existing [0x%llx, 0x%llx) new [0x%llx, 0x%llx)",
9777 existing->start, existing->end + 1,
9778 new->start, new->end + 1);
9779 if (status != NV_OK)
9780 return status;
9781
9782 block_gpu_split_big(existing, block_context, gpu, new_big_ptes, &push);
9783 }
9784
9785 uvm_push_end(&push);
9786
9787 // Adding this push to existing block tracker will cause all GPU PTE splits
9788 // to serialize on each other, but it's simpler than maintaining a separate
9789 // tracker and this path isn't performance-critical.
9790 return uvm_tracker_add_push_safe(&existing->tracker, &push);
9791 }
9792
static NV_STATUS block_split_presplit_ptes(uvm_va_block_t *existing, uvm_va_block_t *new)
9794 {
9795 uvm_gpu_t *gpu;
9796 uvm_gpu_id_t id;
9797 NV_STATUS status;
9798
9799 for_each_gpu_id(id) {
9800 if (!uvm_va_block_gpu_state_get(existing, id))
9801 continue;
9802
9803 gpu = block_get_gpu(existing, id);
9804
9805 if (block_gpu_has_page_tables(existing, gpu)) {
9806 status = block_split_presplit_ptes_gpu(existing, new, gpu);
9807 if (status != NV_OK)
9808 return status;
9809 }
9810 }
9811
9812 return NV_OK;
9813 }
9814
9815 typedef struct
9816 {
9817 // Number of chunks contained by this VA block
9818 size_t num_chunks;
9819
9820 // Index of the "interesting" chunk, either adjacent to or spanning the
9821 // split point depending on which block this is.
9822 size_t chunk_index;
9823
9824 // Size of the chunk referenced by chunk_index
9825 uvm_chunk_size_t chunk_size;
9826 } block_gpu_chunk_split_state_t;
9827
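// Compute the chunk layout of the portion of the block covered by
// [start, end]. Callers pass a truncated end to describe the layout the
// existing block will have after a split (see block_presplit_gpu_chunks).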
static void block_gpu_chunk_get_split_state(uvm_va_block_t *block,
9829 block_gpu_chunk_split_state_t *state,
9830 NvU64 start,
9831 NvU64 end,
9832 uvm_page_index_t page_index,
9833 uvm_gpu_t *gpu)
9834 {
9835 NvU64 size = end - start + 1;
9836 state->num_chunks = block_num_gpu_chunks_range(block, start, size, gpu);
9837 state->chunk_index = block_gpu_chunk_index_range(block, start, size, gpu, page_index, &state->chunk_size);
9838 }
9839
static void block_merge_chunk(uvm_va_block_t *block, uvm_gpu_t *gpu, uvm_gpu_chunk_t *chunk)
9841 {
9842 uvm_gpu_t *accessing_gpu;
9843 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
9844
9845 uvm_pmm_gpu_merge_chunk(&gpu->pmm, chunk);
9846
9847 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9848 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
9849
9850 uvm_pmm_sysmem_mappings_merge_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
9851 peer_addr,
9852 uvm_gpu_chunk_get_size(chunk));
9853 }
9854 }
9855
9856 // Perform any chunk splitting and array growing required for this block split,
9857 // but don't actually move chunk pointers anywhere.
static NV_STATUS block_presplit_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
9859 {
9860 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
9861 uvm_gpu_t *accessing_gpu;
9862 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
9863 uvm_gpu_chunk_t **temp_chunks;
9864 uvm_gpu_chunk_t *original_chunk, *curr_chunk;
9865 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
9866 uvm_chunk_sizes_mask_t split_sizes;
9867 uvm_chunk_size_t subchunk_size;
9868 NV_STATUS status;
9869 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
9870
9871 block_gpu_chunk_get_split_state(existing,
9872 &existing_before_state,
9873 existing->start,
9874 existing->end,
9875 split_page_index,
9876 gpu);
9877 block_gpu_chunk_get_split_state(existing,
9878 &existing_after_state,
9879 existing->start,
9880 new->start - 1,
9881 split_page_index - 1,
9882 gpu);
9883 block_gpu_chunk_get_split_state(new,
9884 &new_state,
9885 new->start,
9886 new->end,
9887 0,
9888 gpu);
9889
9890 // Even though we're splitting existing, we could wind up requiring a larger
9891 // chunks array if we split a large chunk into many smaller ones.
9892 if (existing_after_state.num_chunks > existing_before_state.num_chunks) {
9893 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
9894 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
9895 if (!temp_chunks)
9896 return NV_ERR_NO_MEMORY;
9897 existing_gpu_state->chunks = temp_chunks;
9898 }
9899
9900 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
9901
9902 // If the chunk covering the split point is not populated, we're done. We've
9903 // already grown the array to cover any new chunks which may be populated
9904 // later.
9905 if (!original_chunk)
9906 return NV_OK;
9907
9908 // Figure out the splits we need to perform. Remove all sizes >= the current
9909 // size, and all sizes < the target size. Note that the resulting mask will
9910 // be 0 if the sizes match (we're already splitting at a chunk boundary).
9911 UVM_ASSERT(uvm_gpu_chunk_get_size(original_chunk) == existing_before_state.chunk_size);
9912 UVM_ASSERT(existing_before_state.chunk_size >= new_state.chunk_size);
9913 split_sizes = gpu->parent->mmu_user_chunk_sizes;
9914 split_sizes &= existing_before_state.chunk_size - 1;
9915 split_sizes &= ~(new_state.chunk_size - 1);
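    // For example, splitting a 2M chunk down to a 64K target leaves only the
    // supported sizes in [64K, 2M): "chunk_size - 1" drops 2M and above, and
    // "~(new_state.chunk_size - 1)" drops everything below 64K.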
9916
9917 // Keep splitting the chunk covering the split point until we hit the target
9918 // size.
9919 curr_chunk = original_chunk;
9920 for_each_chunk_size_rev(subchunk_size, split_sizes) {
9921 size_t last_index, num_subchunks;
9922
9923 status = uvm_pmm_gpu_split_chunk(&gpu->pmm, curr_chunk, subchunk_size, NULL);
9924 if (status != NV_OK)
9925 goto error;
9926
9927 // Split physical GPU mappings for indirect peers
9928 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
9929 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, curr_chunk, accessing_gpu);
9930
9931 status = uvm_pmm_sysmem_mappings_split_gpu_chunk_mappings(&accessing_gpu->pmm_reverse_sysmem_mappings,
9932 peer_addr,
9933 subchunk_size);
9934 if (status != NV_OK)
9935 goto error;
9936 }
9937
9938 if (subchunk_size == new_state.chunk_size)
9939 break;
9940
9941 // Compute the last subchunk index prior to the split point. Divide the
9942 // entire address space into units of subchunk_size, then mod by the
9943 // number of subchunks within the parent.
9944 last_index = (size_t)uvm_div_pow2_64(new->start - 1, subchunk_size);
9945 num_subchunks = (size_t)uvm_div_pow2_64(uvm_gpu_chunk_get_size(curr_chunk), subchunk_size);
9946 UVM_ASSERT(num_subchunks > 1);
9947 last_index &= num_subchunks - 1;
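        // For example, with 64K subchunks under a 2M parent there are 32
        // subchunks, so the mask keeps the low 5 bits and last_index selects
        // the subchunk containing the last byte before the split point.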
9948
9949 uvm_pmm_gpu_get_subchunks(&gpu->pmm, curr_chunk, last_index, 1, &curr_chunk);
9950 UVM_ASSERT(uvm_gpu_chunk_get_size(curr_chunk) == subchunk_size);
9951 }
9952
9953 // Note that existing's chunks array still has a pointer to original_chunk,
9954 // not to any newly-split subchunks. If a subsequent split failure occurs on
9955 // a later GPU we'll have to merge it back. Once we're past the preallocate
9956 // stage we'll remove it from the chunks array and move the new split chunks
9957 // in.
9958
9959 return NV_OK;
9960
9961 error:
9962 // On error we need to leave the chunk in its initial state
9963 block_merge_chunk(existing, gpu, original_chunk);
9964
9965 return status;
9966 }
9967
static NV_STATUS block_split_cpu_chunk_to_64k(uvm_va_block_t *block, int nid)
9969 {
9970 uvm_cpu_chunk_storage_mixed_t *mixed;
9971 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, 0);
9972 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9973 NV_STATUS status;
9974
9975 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_2M);
9976 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_CHUNK);
9977
9978 mixed = uvm_kvmalloc_zero(sizeof(*mixed));
9979 if (!mixed)
9980 return NV_ERR_NO_MEMORY;
9981
9982 status = uvm_cpu_chunk_split(chunk, (uvm_cpu_chunk_t **)&mixed->slots);
9983 if (status != NV_OK) {
9984 uvm_kvfree(mixed);
9985 return status;
9986 }
9987
9988 bitmap_fill(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK);
9989 node_state->chunks = (unsigned long)mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
9990
9991 return status;
9992 }
9993
static NV_STATUS block_split_cpu_chunk_to_4k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
9995 {
9996 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
9997 uvm_cpu_chunk_storage_mixed_t *mixed;
9998 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
9999 uvm_cpu_chunk_t **small_chunks;
10000 size_t slot_index;
10001 NV_STATUS status;
10002
10003 UVM_ASSERT(chunk);
10004 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
10005 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10006
10007 mixed = uvm_cpu_storage_get_ptr(node_state);
10008 slot_index = compute_slot_index(block, page_index);
10009 small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
10010 if (!small_chunks)
10011 return NV_ERR_NO_MEMORY;
10012
10013 status = uvm_cpu_chunk_split(chunk, small_chunks);
10014 if (status != NV_OK) {
10015 uvm_kvfree(small_chunks);
10016 return status;
10017 }
10018
10019 mixed->slots[slot_index] = small_chunks;
10020 clear_bit(slot_index, mixed->big_chunks);
10021
10022 return status;
10023 }
10024
static NV_STATUS block_split_cpu_chunk_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10026 {
10027 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10028 uvm_chunk_size_t chunk_size = uvm_cpu_chunk_get_size(chunk);
10029 uvm_chunk_size_t new_size;
10030 uvm_gpu_t *gpu;
10031 NvU64 gpu_mapping_addr;
10032 uvm_processor_mask_t *gpu_split_mask;
10033 uvm_gpu_id_t id;
10034 NV_STATUS status;
10035
10036 gpu_split_mask = uvm_processor_mask_cache_alloc();
10037 if (!gpu_split_mask)
10038 return NV_ERR_NO_MEMORY;
10039
10040 if (chunk_size == UVM_CHUNK_SIZE_2M)
10041 new_size = UVM_CHUNK_SIZE_64K;
10042 else
10043 new_size = UVM_CHUNK_SIZE_4K;
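    // CPU chunks split one level at a time (2M to 64K, then 64K to 4K), so a
    // full 2M to 4K split takes two passes through this function; see the
    // loop in block_presplit_cpu_chunks().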
10044
10045 UVM_ASSERT(IS_ALIGNED(chunk_size, new_size));
10046
10047 uvm_processor_mask_zero(gpu_split_mask);
10048 for_each_gpu_id(id) {
10049 if (!uvm_va_block_gpu_state_get(block, id))
10050 continue;
10051
10052 gpu = block_get_gpu(block, id);
10053
10054 // If the parent chunk has not been mapped, there is nothing to split.
10055 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10056 if (gpu_mapping_addr == 0)
10057 continue;
10058
10059 status = uvm_pmm_sysmem_mappings_split_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10060 gpu_mapping_addr,
10061 new_size);
10062 if (status != NV_OK)
10063 goto merge;
10064
10065 uvm_processor_mask_set(gpu_split_mask, id);
10066 }
10067
10068 if (new_size == UVM_CHUNK_SIZE_64K)
10069 status = block_split_cpu_chunk_to_64k(block, nid);
10070 else
10071 status = block_split_cpu_chunk_to_4k(block, page_index, nid);
10072
10073 if (status != NV_OK) {
10074 merge:
10075 for_each_gpu_id_in_mask(id, gpu_split_mask) {
10076 gpu = block_get_gpu(block, id);
10077 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10078 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10079 gpu_mapping_addr,
10080 chunk_size);
10081 }
10082 }
10083
10084 uvm_processor_mask_cache_free(gpu_split_mask);
10085
10086 return status;
10087 }
10088
static NV_STATUS block_prealloc_cpu_chunk_storage(uvm_va_block_t *existing, uvm_va_block_t *new, int nid)
10090 {
10091 uvm_cpu_chunk_storage_mixed_t *existing_mixed;
10092 uvm_cpu_chunk_storage_mixed_t *new_mixed = NULL;
10093 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
10094 uvm_va_block_cpu_node_state_t *new_node_state = block_node_state_get(new, nid);
10095 size_t slot_offset;
10096 size_t existing_slot;
10097 NV_STATUS status = NV_OK;
10098
10099 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10100 existing_mixed = uvm_cpu_storage_get_ptr(node_state);
10101
    // Pre-allocate chunk storage for the new block. By definition, the new
    // block will contain 64K and/or 4K chunks.
10104 //
10105 // We do this here so there are no failures in block_split_cpu().
10106 new_mixed = uvm_kvmalloc_zero(sizeof(*new_mixed));
10107 if (!new_mixed)
10108 return NV_ERR_NO_MEMORY;
10109
10110 slot_offset = compute_slot_index(existing, uvm_va_block_cpu_page_index(existing, new->start));
10111 existing_slot = slot_offset;
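    // Each slot of existing at or after the split point that is not backed by
    // a 64K chunk maps to slot (existing_slot - slot_offset) in the new
    // block, since the new block begins at the split point.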
10112 for_each_clear_bit_from(existing_slot, existing_mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK) {
10113 size_t new_slot = existing_slot - slot_offset;
10114
10115 if (existing_mixed->slots[existing_slot]) {
10116 uvm_cpu_chunk_t **small_chunks = uvm_kvmalloc_zero(sizeof(*small_chunks) * MAX_SMALL_CHUNKS_PER_BIG_SLOT);
10117
10118 if (!small_chunks) {
10119 status = NV_ERR_NO_MEMORY;
10120 goto done;
10121 }
10122
10123 new_mixed->slots[new_slot] = small_chunks;
10124 }
10125 }
10126
10127 new_node_state->chunks = (unsigned long)new_mixed | UVM_CPU_CHUNK_STORAGE_MIXED;
10128 UVM_ASSERT(status == NV_OK);
10129
10130 done:
10131 if (status != NV_OK) {
10132 for (; existing_slot > slot_offset; existing_slot--)
10133 uvm_kvfree(new_mixed->slots[existing_slot - slot_offset]);
10134
10135 uvm_kvfree(new_mixed);
10136 }
10137
10138 return status;
10139 }
10140
static void block_free_cpu_chunk_storage(uvm_va_block_t *block, int nid)
10142 {
10143 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10144
10145 if (node_state->chunks) {
10146 uvm_cpu_chunk_storage_mixed_t *mixed;
10147 size_t slot_index;
10148
10149 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10150 mixed = uvm_cpu_storage_get_ptr(node_state);
10151 for (slot_index = 0; slot_index < MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK; slot_index++)
10152 uvm_kvfree(mixed->slots[slot_index]);
10153
10154 uvm_kvfree(mixed);
10155 node_state->chunks = 0;
10156 }
10157 }
10158
10159 // Perform any CPU chunk splitting that may be required for this block split.
10160 // Just like block_presplit_gpu_chunks, no chunks are moved to the new block.
static NV_STATUS block_presplit_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
10162 {
10163 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
10164 uvm_cpu_chunk_t *splitting_chunk;
10165 uvm_chunk_sizes_mask_t split_sizes = uvm_cpu_chunk_get_allocation_sizes();
10166 uvm_chunk_size_t subchunk_size;
10167 NV_STATUS status = NV_OK;
10168 int nid;
10169
10170 UVM_ASSERT(!IS_ALIGNED(new->start, UVM_VA_BLOCK_SIZE));
10171
10172 for_each_possible_uvm_node(nid) {
10173 splitting_chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10174
10175 // If the page covering the split point has not been populated, there is no
10176 // need to split.
10177 if (!splitting_chunk)
10178 continue;
10179
10180 // If the split point is aligned on the chunk size, there is no need to
10181 // split.
10182 if (IS_ALIGNED(new->start, uvm_cpu_chunk_get_size(splitting_chunk)))
10183 continue;
10184
10185 // Remove all sizes above the chunk's current size.
10186 split_sizes &= uvm_cpu_chunk_get_size(splitting_chunk) - 1;
10187 // Remove all sizes below the alignment of the new block's start.
10188 split_sizes &= ~(IS_ALIGNED(new->start, UVM_CHUNK_SIZE_64K) ? UVM_CHUNK_SIZE_64K - 1 : 0);
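        // For example, a 2M chunk with a 64K-aligned split point only needs a
        // 2M -> 64K split; if the split point is not 64K-aligned, the 64K
        // chunk covering it is split down to 4K as well.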
10189
10190 for_each_chunk_size_rev(subchunk_size, split_sizes) {
10191 status = block_split_cpu_chunk_one(existing, page_index, nid);
10192 if (status != NV_OK)
10193 return status;
10194 }
10195
10196 status = block_prealloc_cpu_chunk_storage(existing, new, nid);
10197 if (status != NV_OK)
10198 break;
10199 }
10200
10201 return status;
10202 }
10203
static void block_merge_cpu_chunks_to_64k(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10205 {
10206 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10207 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
10208 size_t slot_index = compute_slot_index(block, page_index);
10209 uvm_cpu_chunk_t **small_chunks = mixed->slots[slot_index];
10210 uvm_cpu_chunk_t *merged_chunk;
10211
10212 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10213 UVM_ASSERT(small_chunks);
10214 UVM_ASSERT(!test_bit(slot_index, mixed->big_chunks));
10215
10216 merged_chunk = uvm_cpu_chunk_merge(small_chunks);
10217 mixed->slots[slot_index] = merged_chunk;
10218 set_bit(slot_index, mixed->big_chunks);
10219 uvm_kvfree(small_chunks);
10220 }
10221
static void block_merge_cpu_chunks_to_2m(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10223 {
10224 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(block, nid);
10225 uvm_cpu_chunk_storage_mixed_t *mixed = uvm_cpu_storage_get_ptr(node_state);
10226 uvm_cpu_chunk_t **big_chunks = (uvm_cpu_chunk_t **)&mixed->slots;
10227 uvm_cpu_chunk_t *merged_chunk;
10228
10229 UVM_ASSERT(uvm_cpu_storage_get_type(node_state) == UVM_CPU_CHUNK_STORAGE_MIXED);
10230 UVM_ASSERT(bitmap_full(mixed->big_chunks, MAX_BIG_CPU_CHUNK_SLOTS_PER_UVM_VA_BLOCK));
10231
10232 merged_chunk = uvm_cpu_chunk_merge(big_chunks);
10233 node_state->chunks = (unsigned long)merged_chunk | UVM_CPU_CHUNK_STORAGE_CHUNK;
10234 uvm_kvfree(mixed);
10235 }
10236
static void block_merge_cpu_chunks_one(uvm_va_block_t *block, uvm_page_index_t page_index, int nid)
10238 {
10239 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10240 uvm_gpu_id_t id;
10241
10242 if (!chunk)
10243 return;
10244
10245 if (uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_4K) {
10246 block_merge_cpu_chunks_to_64k(block, page_index, nid);
10247 }
10248 else {
10249 UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == UVM_CHUNK_SIZE_64K);
10250 block_merge_cpu_chunks_to_2m(block, page_index, nid);
10251 }
10252
10253 chunk = uvm_cpu_chunk_get_chunk_for_page(block, nid, page_index);
10254
10255 for_each_gpu_id(id) {
10256 NvU64 gpu_mapping_addr;
10257 uvm_gpu_t *gpu;
10258
10259 if (!uvm_va_block_gpu_state_get(block, id))
10260 continue;
10261
10262 gpu = block_get_gpu(block, id);
10263 gpu_mapping_addr = uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk, gpu->parent);
10264 if (gpu_mapping_addr == 0)
10265 continue;
10266
10267 uvm_pmm_sysmem_mappings_merge_gpu_mappings(&gpu->pmm_reverse_sysmem_mappings,
10268 gpu_mapping_addr,
10269 uvm_cpu_chunk_get_size(chunk));
10270 }
10271 }
10272
static void block_merge_cpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new)
10274 {
10275 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(existing, new->start);
10276 uvm_chunk_sizes_mask_t merge_sizes = uvm_cpu_chunk_get_allocation_sizes();
10277 uvm_chunk_size_t largest_size;
10278 size_t block_size = uvm_va_block_size(existing);
10279 int nid;
10280
10281 // Since block sizes are not always powers of 2, use the largest power of 2
10282 // less than or equal to the block size since we can't merge to a size
10283 // larger than the block's size.
10284 largest_size = rounddown_pow_of_two(block_size);
10285
10286 for_each_possible_uvm_node(nid) {
10287 uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10288 uvm_va_block_cpu_node_state_t *node_state = block_node_state_get(existing, nid);
10289 uvm_chunk_size_t chunk_size;
10290 uvm_chunk_size_t merge_size;
10291
10292 if (!chunk || uvm_cpu_chunk_is_physical(chunk))
10293 continue;
10294
10295 chunk_size = uvm_cpu_chunk_get_size(chunk);
10296
10297 // Remove all CPU chunk sizes above the size of the existing VA block.
10298 merge_sizes &= (largest_size | (largest_size - 1));
10299
        // Remove the chunk's current size and all smaller sizes, since the
        // chunk can only be merged up into something larger.
10301 merge_sizes &= ~(chunk_size | (chunk_size - 1));
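        // For example, merging up from a 4K chunk in a 2M block leaves 64K
        // and 2M as the candidate merge sizes for the loop below.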
10302
10303 for_each_chunk_size(merge_size, merge_sizes) {
10304 uvm_va_block_region_t chunk_region;
10305
10306 // The block has to fully contain the VA range after the merge.
10307 if (!uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size)) ||
10308 !uvm_va_block_contains_address(existing, UVM_ALIGN_DOWN(new->start, merge_size) + merge_size - 1))
10309 break;
10310
10311 chunk_region = uvm_va_block_chunk_region(existing, merge_size, page_index);
10312
10313 // If not all pages in the region covered by the chunk are allocated,
10314 // we can't merge.
10315 if (!uvm_page_mask_region_full(&node_state->allocated, chunk_region))
10316 break;
10317
10318 block_merge_cpu_chunks_one(existing, chunk_region.first, nid);
10319 chunk = uvm_cpu_chunk_get_chunk_for_page(existing, nid, page_index);
10320 if (uvm_cpu_chunk_is_physical(chunk))
10321 break;
10322 }
10323
10324 block_free_cpu_chunk_storage(new, nid);
10325 }
10326 }
10327
10328 // Pre-allocate everything which doesn't require retry on both existing and new
10329 // which will be needed to handle a split. If this fails, existing must remain
10330 // functionally unmodified.
static NV_STATUS block_split_preallocate_no_retry(uvm_va_block_t *existing, uvm_va_block_t *new)
10332 {
10333 NV_STATUS status;
10334 uvm_gpu_t *gpu;
10335 uvm_gpu_id_t id;
10336 uvm_page_index_t split_page_index;
10337 uvm_va_block_test_t *block_test;
10338
10339 status = block_presplit_cpu_chunks(existing, new);
10340 if (status != NV_OK)
10341 goto error;
10342
10343 for_each_gpu_id(id) {
10344 if (!uvm_va_block_gpu_state_get(existing, id))
10345 continue;
10346
10347 gpu = block_get_gpu(existing, id);
10348
10349 status = block_presplit_gpu_chunks(existing, new, gpu);
10350 if (status != NV_OK)
10351 goto error;
10352
10353 if (!block_gpu_state_get_alloc(new, gpu)) {
10354 status = NV_ERR_NO_MEMORY;
10355 goto error;
10356 }
10357 }
10358
10359 block_test = uvm_va_block_get_test(existing);
10360 if (block_test && block_test->inject_split_error) {
10361 block_test->inject_split_error = false;
10362 if (!uvm_va_block_is_hmm(existing)) {
10363 UVM_ASSERT(existing->va_range->inject_split_error);
10364 existing->va_range->inject_split_error = false;
10365 }
10366 status = NV_ERR_NO_MEMORY;
10367 goto error;
10368 }
10369
10370 if (uvm_va_block_is_hmm(existing)) {
10371 uvm_va_policy_node_t *node = uvm_va_policy_node_find(existing, new->start);
10372
10373 if (node && node->node.start != new->start) {
10374 status = uvm_va_policy_node_split(existing, node, new->start - 1, NULL);
10375 if (status != NV_OK)
10376 goto error;
10377 }
10378 }
10379
10380 return NV_OK;
10381
10382 error:
10383 // Merge back the chunks we split
10384 split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10385
10386 for_each_gpu_id(id) {
10387 uvm_gpu_chunk_t *chunk;
10388 size_t chunk_index;
10389 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, id);
10390
10391 if (!existing_gpu_state)
10392 continue;
10393
10394 // If the chunk spanning the split point was split, merge it back
10395 gpu = block_get_gpu(existing, id);
10396 chunk_index = block_gpu_chunk_index(existing, gpu, split_page_index, NULL);
10397 chunk = existing_gpu_state->chunks[chunk_index];
10398 if (!chunk || chunk->state != UVM_PMM_GPU_CHUNK_STATE_IS_SPLIT)
10399 continue;
10400
10401 block_merge_chunk(existing, gpu, chunk);
10402
10403 // We could attempt to shrink the chunks array back down, but it doesn't
10404 // hurt much to have it larger than necessary, and we'd have to handle
10405 // the shrink call failing anyway on this error path.
10406
10407 }
10408
10409 block_merge_cpu_chunks(existing, new);
10410
10411 return status;
10412 }
10413
10414 // Re-calculate the block's top-level processor masks:
10415 // - block->mapped
10416 // - block->resident
10417 //
10418 // This is called on block split.
static void block_set_processor_masks(uvm_va_block_t *block)
10420 {
10421 size_t num_pages = uvm_va_block_num_cpu_pages(block);
10422 uvm_va_block_region_t block_region = uvm_va_block_region(0, num_pages);
10423 uvm_gpu_id_t id;
10424
10425 if (uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], block_region)) {
10426 UVM_ASSERT(uvm_page_mask_region_empty(&block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], block_region));
10427 uvm_processor_mask_clear(&block->mapped, UVM_ID_CPU);
10428 }
10429 else {
10430 uvm_processor_mask_set(&block->mapped, UVM_ID_CPU);
10431 }
10432
10433 if (uvm_page_mask_region_empty(uvm_va_block_resident_mask_get(block, UVM_ID_CPU, NUMA_NO_NODE), block_region)) {
10434 uvm_va_space_t *va_space = uvm_va_block_get_va_space(block);
10435
10436 if (uvm_processor_mask_get_gpu_count(&va_space->can_access[UVM_ID_CPU_VALUE]) == 0)
10437 UVM_ASSERT(!uvm_processor_mask_test(&block->mapped, UVM_ID_CPU));
10438
10439 block_clear_resident_processor(block, UVM_ID_CPU);
10440 }
10441 else {
10442 block_set_resident_processor(block, UVM_ID_CPU);
10443 }
10444
10445 for_each_gpu_id(id) {
10446 uvm_va_block_gpu_state_t *gpu_state = uvm_va_block_gpu_state_get(block, id);
10447 if (!gpu_state)
10448 continue;
10449
10450 if (uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_READ], block_region)) {
10451 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_WRITE], block_region));
10452 UVM_ASSERT(uvm_page_mask_region_empty(&gpu_state->pte_bits[UVM_PTE_BITS_GPU_ATOMIC], block_region));
10453 uvm_processor_mask_clear(&block->mapped, id);
10454 }
10455 else {
10456 uvm_processor_mask_set(&block->mapped, id);
10457 }
10458
10459 if (uvm_page_mask_region_empty(&gpu_state->resident, block_region))
10460 block_clear_resident_processor(block, id);
10461 else
10462 block_set_resident_processor(block, id);
10463
10464 if (uvm_page_mask_region_empty(&gpu_state->evicted, block_region))
10465 uvm_processor_mask_clear(&block->evicted_gpus, id);
10466 else
10467 uvm_processor_mask_set(&block->evicted_gpus, id);
10468 }
10469 }
10470
10471 // Split a PAGES_PER_UVM_VA_BLOCK sized bitmap into new and existing parts
10472 // corresponding to a block split.
static void block_split_page_mask(uvm_page_mask_t *existing_mask,
10474 size_t existing_pages,
10475 uvm_page_mask_t *new_mask,
10476 size_t new_pages)
10477 {
10478 UVM_ASSERT_MSG(existing_pages + new_pages <= PAGES_PER_UVM_VA_BLOCK, "existing %zu new %zu\n",
10479 existing_pages, new_pages);
10480
10481 // The new block is always in the upper region of existing, so shift the bit
10482 // vectors down.
10483 //
10484 // Note that bitmap_shift_right requires both dst and src to be the same
10485 // size. That's ok since we don't scale them by block size.
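    // For example, splitting a 128-page mask at page 96 moves bits 96..127
    // into bits 0..31 of new_mask and then clears them from existing_mask.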
10486 uvm_page_mask_shift_right(new_mask, existing_mask, existing_pages);
10487 uvm_page_mask_region_clear(existing_mask, uvm_va_block_region(existing_pages, existing_pages + new_pages));
10488 }
10489
10490 // Split the CPU state within the existing block. existing's start is correct
10491 // but its end has not yet been adjusted.
static void block_split_cpu(uvm_va_block_t *existing, uvm_va_block_t *new)
10493 {
10494 size_t existing_pages, new_pages = uvm_va_block_num_cpu_pages(new);
10495 uvm_pte_bits_cpu_t pte_bit;
10496 uvm_va_block_region_t block_region = uvm_va_block_region_from_block(existing);
10497 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10498 uvm_page_index_t page_index;
10499 uvm_page_index_t next_page_index;
10500 uvm_cpu_chunk_t *chunk;
10501 uvm_va_range_t *existing_va_range = existing->va_range;
10502 int nid;
10503
10504 if (existing_va_range) {
10505 UVM_ASSERT(existing->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10506 UVM_ASSERT(existing->va_range->type == new->va_range->type);
10507 }
10508
10509 UVM_ASSERT(existing->start < new->start);
10510 UVM_ASSERT(existing->end == new->end);
10511
10512 UVM_ASSERT(PAGE_ALIGNED(new->start));
10513 UVM_ASSERT(PAGE_ALIGNED(existing->start));
10514
10515 existing_pages = (new->start - existing->start) / PAGE_SIZE;
10516
10517 // We don't have to unmap the CPU since its virtual -> physical mappings
10518 // don't change.
10519
10520 for_each_possible_uvm_node(nid) {
10521 uvm_page_mask_t *existing_resident_mask = uvm_va_block_resident_mask_get(existing, UVM_ID_CPU, nid);
10522 uvm_page_mask_t *new_resident_mask = uvm_va_block_resident_mask_get(new, UVM_ID_CPU, nid);
10523
10524 for_each_cpu_chunk_in_block_region_safe(chunk,
10525 page_index,
10526 next_page_index,
10527 existing,
10528 nid,
10529 uvm_va_block_region(split_page_index, block_region.outer)) {
10530 uvm_page_index_t new_chunk_page_index;
10531 NV_STATUS status;
10532
10533 uvm_cpu_chunk_remove_from_block(existing, nid, page_index);
10534
10535 // The chunk has to be adjusted for the new block before inserting it.
10536 new_chunk_page_index = page_index - split_page_index;
10537
10538 // This should never fail because all necessary storage was allocated
10539 // in block_presplit_cpu_chunks().
10540 status = uvm_cpu_chunk_insert_in_block(new, chunk, new_chunk_page_index);
10541 UVM_ASSERT(status == NV_OK);
10542 }
10543
10544 block_split_page_mask(existing_resident_mask, existing_pages, new_resident_mask, new_pages);
10545 }
10546
10547 block_split_page_mask(&existing->cpu.resident, existing_pages, &new->cpu.resident, new_pages);
10548 new->cpu.ever_mapped = existing->cpu.ever_mapped;
10549
10550 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_CPU_MAX; pte_bit++)
10551 block_split_page_mask(&existing->cpu.pte_bits[pte_bit], existing_pages, &new->cpu.pte_bits[pte_bit], new_pages);
10552 }
10553
10554 // Fill out the blocks' chunks arrays with the chunks split by
10555 // block_presplit_gpu_chunks.
static void block_copy_split_gpu_chunks(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_t *gpu)
10557 {
10558 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu->id);
10559 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu->id);
10560 uvm_gpu_chunk_t **temp_chunks;
10561 uvm_gpu_chunk_t *original_chunk;
10562 block_gpu_chunk_split_state_t existing_before_state, existing_after_state, new_state;
10563 size_t num_pre_chunks, num_post_chunks, num_split_chunks_existing, num_split_chunks_new;
10564 uvm_page_index_t split_page_index = uvm_va_block_cpu_page_index(existing, new->start);
10565 size_t i;
10566
10567 block_gpu_chunk_get_split_state(existing,
10568 &existing_before_state,
10569 existing->start,
10570 existing->end,
10571 split_page_index,
10572 gpu);
10573 block_gpu_chunk_get_split_state(existing,
10574 &existing_after_state,
10575 existing->start,
10576 new->start - 1,
10577 split_page_index - 1,
10578 gpu);
10579 block_gpu_chunk_get_split_state(new,
10580 &new_state,
10581 new->start,
10582 new->end,
10583 0,
10584 gpu);
10585
10586 // General case (B is original_chunk):
10587 // split
10588 // v
10589 // existing (before) [------ A -----][------ B -----][------ C -----]
10590 // existing (after) [------ A -----][- B0 -]
10591 // new [- B1 -][------ C -----]
10592 //
10593 // Note that the logic below also handles the case of the split happening at
10594 // a chunk boundary. That case behaves as though there is no B0 chunk.
10595
10596 // Number of chunks to the left and right of original_chunk (A and C above).
10597 // Either or both of these may be 0.
10598 num_pre_chunks = existing_before_state.chunk_index;
10599 num_post_chunks = existing_before_state.num_chunks - num_pre_chunks - 1;
10600
10601 // Number of subchunks under existing's portion of original_chunk (B0 above)
10602 num_split_chunks_existing = existing_after_state.num_chunks - num_pre_chunks;
10603
10604 // Number of subchunks under new's portion of original_chunk (B1 above)
10605 num_split_chunks_new = new_state.num_chunks - num_post_chunks;
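    // For example, if B above splits into four subchunks with one landing in
    // existing and three in new, then num_pre_chunks == 1,
    // num_post_chunks == 1, num_split_chunks_existing == 1 and
    // num_split_chunks_new == 3.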
10606
10607 UVM_ASSERT(num_pre_chunks + num_split_chunks_existing > 0);
10608 UVM_ASSERT(num_split_chunks_new > 0);
10609
10610 // Copy post chunks from the end of existing into new (C above)
10611 memcpy(&new_gpu_state->chunks[num_split_chunks_new],
10612 &existing_gpu_state->chunks[existing_before_state.chunk_index + 1],
10613 num_post_chunks * sizeof(new_gpu_state->chunks[0]));
10614
10615 // Save off the original split chunk since we may overwrite the array
10616 original_chunk = existing_gpu_state->chunks[existing_before_state.chunk_index];
10617
10618 // Fill out the new pointers
10619 if (original_chunk) {
10620 // Note that if the split happened at a chunk boundary, original_chunk
10621 // will not be split. In that case, num_split_chunks_existing will be 0
10622 // and num_split_chunks_new will be 1, so the left copy will be skipped
10623 // and the right copy will pick up the chunk.
10624
10625 // Copy left newly-split chunks into existing (B0 above). The array was
10626 // re-sized in block_presplit_gpu_chunks as necessary.
10627 size_t num_subchunks;
10628
10629 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
10630 original_chunk,
10631 0, // start_index
10632 num_split_chunks_existing,
10633 &existing_gpu_state->chunks[existing_before_state.chunk_index]);
10634 UVM_ASSERT(num_subchunks == num_split_chunks_existing);
10635
10636 // Copy right newly-split chunks into new (B1 above), overwriting the
10637 // pointer to the original chunk.
10638 num_subchunks = uvm_pmm_gpu_get_subchunks(&gpu->pmm,
10639 original_chunk,
10640 num_split_chunks_existing, // start_index
10641 num_split_chunks_new,
10642 &new_gpu_state->chunks[0]);
10643 UVM_ASSERT(num_subchunks == num_split_chunks_new);
10644 }
10645 else {
10646 // If the chunk wasn't already populated we don't need to copy pointers
10647 // anywhere, but we need to clear out stale pointers from existing's
10648 // array covering the new elements. new's chunks array was already zero-
10649 // initialized.
10650 memset(&existing_gpu_state->chunks[existing_before_state.chunk_index],
10651 0,
10652 num_split_chunks_existing * sizeof(existing_gpu_state->chunks[0]));
10653 }
10654
10655 // Since we update the reverse map information, protect it against a
10656 // concurrent lookup
10657 uvm_spin_lock(&gpu->pmm.list_lock);
10658
10659 // Update the reverse map of all the chunks that are now under the new block
10660 for (i = 0; i < new_state.num_chunks; ++i) {
10661 if (new_gpu_state->chunks[i]) {
10662 UVM_ASSERT(new_gpu_state->chunks[i]->va_block == existing);
10663 new_gpu_state->chunks[i]->va_block = new;
10664
10665 // Adjust the page_index within the VA block for the new subchunks in
10666 // the new VA block
10667 UVM_ASSERT(new_gpu_state->chunks[i]->va_block_page_index >= split_page_index);
10668 new_gpu_state->chunks[i]->va_block_page_index -= split_page_index;
10669 }
10670 }
10671
10672 uvm_spin_unlock(&gpu->pmm.list_lock);
10673
10674 // Attempt to shrink existing's chunk allocation. If the realloc fails, just
10675 // keep on using the old larger one.
10676 if (existing_after_state.num_chunks < existing_before_state.num_chunks) {
10677 temp_chunks = uvm_kvrealloc(existing_gpu_state->chunks,
10678 existing_after_state.num_chunks * sizeof(existing_gpu_state->chunks[0]));
10679 if (temp_chunks)
10680 existing_gpu_state->chunks = temp_chunks;
10681 }
10682 }
10683
10684 static void block_split_gpu(uvm_va_block_t *existing, uvm_va_block_t *new, uvm_gpu_id_t gpu_id)
10685 {
10686 uvm_va_block_gpu_state_t *existing_gpu_state = uvm_va_block_gpu_state_get(existing, gpu_id);
10687 uvm_va_block_gpu_state_t *new_gpu_state = uvm_va_block_gpu_state_get(new, gpu_id);
10688 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing);
10689 uvm_gpu_va_space_t *gpu_va_space;
10690 uvm_gpu_t *gpu;
10691 uvm_gpu_t *accessing_gpu;
10692 size_t new_pages = uvm_va_block_num_cpu_pages(new);
10693 size_t existing_pages, existing_pages_4k, existing_pages_big, new_pages_big;
10694 uvm_pte_bits_gpu_t pte_bit;
10695 size_t num_chunks, i;
10696 uvm_cpu_chunk_t *cpu_chunk;
10697 uvm_page_index_t page_index;
10698 int nid;
10699
10700 if (!existing_gpu_state)
10701 return;
10702
10703 gpu = uvm_va_space_get_gpu(va_space, gpu_id);
10704 UVM_ASSERT(new_gpu_state);
10705
10706 new_gpu_state->force_4k_ptes = existing_gpu_state->force_4k_ptes;
10707
10708 UVM_ASSERT(PAGE_ALIGNED(new->start));
10709 UVM_ASSERT(PAGE_ALIGNED(existing->start));
10710 existing_pages = (new->start - existing->start) / PAGE_SIZE;
10711
10712 for_each_possible_uvm_node(nid) {
10713 for_each_cpu_chunk_in_block(cpu_chunk, page_index, new, nid) {
10714 uvm_pmm_sysmem_mappings_reparent_gpu_mapping(&gpu->pmm_reverse_sysmem_mappings,
10715 uvm_cpu_chunk_get_parent_gpu_phys_addr(cpu_chunk,
10716 gpu->parent),
10717 new);
10718 }
10719 }
10720
10721 block_copy_split_gpu_chunks(existing, new, gpu);
10722
10723 num_chunks = block_num_gpu_chunks(new, gpu);
10724
10725 // Reparent GPU mappings for indirect peers
10726 for (i = 0; i < num_chunks; ++i) {
10727 uvm_gpu_chunk_t *chunk = new_gpu_state->chunks[i];
10728 if (!chunk)
10729 continue;
10730
10731 for_each_va_space_gpu_in_mask(accessing_gpu, va_space, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
10732 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
10733
10734 uvm_pmm_sysmem_mappings_reparent_gpu_chunk_mapping(&accessing_gpu->pmm_reverse_sysmem_mappings,
10735 peer_addr,
10736 new);
10737 }
10738 }
10739
10740 block_split_page_mask(&existing_gpu_state->resident,
10741 existing_pages,
10742 &new_gpu_state->resident,
10743 new_pages);
10744
10745 for (pte_bit = 0; pte_bit < UVM_PTE_BITS_GPU_MAX; pte_bit++) {
10746 block_split_page_mask(&existing_gpu_state->pte_bits[pte_bit], existing_pages,
10747 &new_gpu_state->pte_bits[pte_bit], new_pages);
10748 }
10749
10750 // Adjust page table ranges.
10751 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
10752 if (gpu_va_space) {
10753 if (existing_gpu_state->page_table_range_big.table) {
10754 NvU32 big_page_size = uvm_va_block_gpu_big_page_size(existing, gpu);
10755
10756 // existing's end has not been adjusted yet
10757 existing_pages_big = range_num_big_pages(existing->start, new->start - 1, big_page_size);
10758
10759 // Take references on all big pages covered by new
10760 new_pages_big = uvm_va_block_num_big_pages(new, big_page_size);
10761 if (new_pages_big) {
10762 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
10763 &existing_gpu_state->page_table_range_big,
10764 &new_gpu_state->page_table_range_big,
10765 new_pages_big);
10766
10767 // If the split point is within a big page region, we might have
10768 // a gap since neither existing nor new can use it anymore.
10769 // Get the top N bits from existing's mask to handle that.
10770 bitmap_shift_right(new_gpu_state->big_ptes,
10771 existing_gpu_state->big_ptes,
10772 uvm_va_block_num_big_pages(existing, big_page_size) - new_pages_big,
10773 MAX_BIG_PAGES_PER_UVM_VA_BLOCK);
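// For illustration with hypothetical counts: if existing originally had 16
// big pages and new covers the last 4, the shift distance above is
// 16 - 4 == 12, so existing's big_ptes bits 12..15 land in new's bits 0..3.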
10774
10775 new_gpu_state->initialized_big = existing_gpu_state->initialized_big;
10776 }
10777
10778 // Drop existing's references on the big PTEs it no longer covers
10779 // now that new has references on them. Note that neither existing
10780 // nor new might have big PTEs after the split. In that case, this
10781 // shrink will free the entire old range.
10782 uvm_page_table_range_shrink(&gpu_va_space->page_tables,
10783 &existing_gpu_state->page_table_range_big,
10784 existing_pages_big);
10785
10786 if (existing_pages_big == 0) {
10787 memset(&existing_gpu_state->page_table_range_big, 0, sizeof(existing_gpu_state->page_table_range_big));
10788 existing_gpu_state->initialized_big = false;
10789 }
10790
10791 bitmap_clear(existing_gpu_state->big_ptes,
10792 existing_pages_big,
10793 MAX_BIG_PAGES_PER_UVM_VA_BLOCK - existing_pages_big);
10794 }
10795
10796 if (existing_gpu_state->page_table_range_4k.table) {
10797 // Since existing and new share the same PDE we just need to bump
10798 // the ref-count on new's sub-range.
10799 uvm_page_table_range_get_upper(&gpu_va_space->page_tables,
10800 &existing_gpu_state->page_table_range_4k,
10801 &new_gpu_state->page_table_range_4k,
10802 uvm_va_block_size(new) / UVM_PAGE_SIZE_4K);
10803
10804 // Drop existing's references on the PTEs it no longer covers now
10805 // that new has references on them.
10806 existing_pages_4k = existing_pages * (PAGE_SIZE / UVM_PAGE_SIZE_4K);
10807 uvm_page_table_range_shrink(&gpu_va_space->page_tables,
10808 &existing_gpu_state->page_table_range_4k,
10809 existing_pages_4k);
10810 }
10811
10812 // We have to set this explicitly to handle the case of splitting an
10813 // invalid, active 2M PTE with no lower page tables allocated.
10814 if (existing_gpu_state->pte_is_2m) {
10815 UVM_ASSERT(!existing_gpu_state->page_table_range_big.table);
10816 UVM_ASSERT(!existing_gpu_state->page_table_range_4k.table);
10817 existing_gpu_state->pte_is_2m = false;
10818 }
10819
10820 // existing can't possibly cover 2MB after a split, so drop any 2M PTE
10821 // references it has. We've taken the necessary references on the lower
10822 // tables above.
10823 block_put_ptes_safe(&gpu_va_space->page_tables, &existing_gpu_state->page_table_range_2m);
10824 existing_gpu_state->activated_big = false;
10825 existing_gpu_state->activated_4k = false;
10826 }
10827
10828 block_split_page_mask(&existing_gpu_state->evicted, existing_pages, &new_gpu_state->evicted, new_pages);
10829 }
10830
10831 NV_STATUS uvm_va_block_split(uvm_va_block_t *existing_va_block,
10832 NvU64 new_end,
10833 uvm_va_block_t **new_va_block,
10834 uvm_va_range_t *new_va_range)
10835 {
10836 uvm_va_space_t *va_space;
10837 uvm_va_block_t *new_block = NULL;
10838 NV_STATUS status;
10839
10840 va_space = new_va_range->va_space;
10841 UVM_ASSERT(existing_va_block->va_range);
10842 UVM_ASSERT(existing_va_block->va_range->va_space == va_space);
10843 UVM_ASSERT(!uvm_va_block_is_hmm(existing_va_block));
10844
10845 // External range types can't be split
10846 UVM_ASSERT(existing_va_block->va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10847 UVM_ASSERT(new_va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
10848 uvm_assert_rwsem_locked_write(&va_space->lock);
10849
10850 UVM_ASSERT(new_end > existing_va_block->start);
10851 UVM_ASSERT(new_end < existing_va_block->end);
10852 UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
10853
10854 status = uvm_va_block_create(new_va_range, new_end + 1, existing_va_block->end, &new_block);
10855 if (status != NV_OK)
10856 return status;
10857
10858 // We're protected from other splits and faults by the va_space lock being
10859 // held in write mode, but that doesn't stop the reverse mapping (eviction
10860 // path) from inspecting the existing block. Stop those threads by taking
10861 // the block lock. When a reverse mapping thread takes this lock after the
10862 // split has been performed, it will have to re-inspect state and may see
10863 // that it should use the newly-split block instead.
10864 uvm_mutex_lock(&existing_va_block->lock);
10865
10866 status = uvm_va_block_split_locked(existing_va_block, new_end, new_block, new_va_range);
10867
10868 uvm_mutex_unlock(&existing_va_block->lock);
10869
10870 if (status != NV_OK)
10871 uvm_va_block_release(new_block);
10872 else if (new_va_block)
10873 *new_va_block = new_block;
10874
10875 return status;
10876 }
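// A minimal usage sketch (hypothetical caller and names, for illustration
// only), assuming the va_space lock is held for write and both va_ranges are
// managed:
//
//     uvm_va_block_t *new_block;
//     // new_end is the last byte existing keeps; split_addr must be page-aligned
//     NV_STATUS status = uvm_va_block_split(existing_block, split_addr - 1, &new_block, new_range);
//     if (status != NV_OK)
//         return status; // existing_block is left unmodified on failure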
10877
10878 NV_STATUS uvm_va_block_split_locked(uvm_va_block_t *existing_va_block,
10879 NvU64 new_end,
10880 uvm_va_block_t *new_block,
10881 uvm_va_range_t *new_va_range)
10882 {
10883 uvm_va_space_t *va_space = uvm_va_block_get_va_space(existing_va_block);
10884 uvm_gpu_id_t id;
10885 NV_STATUS status;
10886 uvm_perf_event_data_t event_data;
10887 uvm_va_block_context_t *va_block_context;
10888
10889 uvm_assert_rwsem_locked_write(&va_space->lock);
10890
10891 va_block_context = uvm_va_space_block_context(va_space, NULL);
10892
10893 UVM_ASSERT(block_check_chunks(existing_va_block));
10894
10895 // As soon as we update existing's reverse mappings to point to the newly-
10896 // split block, the eviction path could try to operate on the new block.
10897 // Lock that out too until new is ready.
10898 //
10899 // Note that we usually shouldn't nest block locks, but it's ok here because
10900 // we just created new_block so no other thread could possibly take it out
10901 // of order with existing's lock.
10902 uvm_mutex_lock_nested(&new_block->lock);
10903
10904 // The split has to be transactional, meaning that if we fail, the existing
10905 // block must not be modified. Handle that by pre-allocating everything we
10906 // might need under both existing and new at the start so we only have a
10907 // single point of failure.
10908
10909 // Since pre-allocation might require allocating new PTEs, we have to handle
10910 // allocation retry which might drop existing's block lock. The
10911 // preallocation is split into two steps for that: the first part which
10912 // allocates and splits PTEs can handle having the block lock dropped then
10913 // re-taken. It won't modify existing_va_block other than adding new PTE
10914 // allocations and splitting existing PTEs, which is always safe.
10915 status = UVM_VA_BLOCK_RETRY_LOCKED(existing_va_block,
10916 NULL,
10917 block_split_presplit_ptes(existing_va_block, new_block));
10918 if (status != NV_OK)
10919 goto out;
10920
10921 // Pre-allocate, stage two. This modifies existing_va_block in ways which
10922 // violate many assumptions (such as changing chunk size), but it will put
10923 // things back into place on a failure without dropping the block lock.
10924 status = block_split_preallocate_no_retry(existing_va_block, new_block);
10925 if (status != NV_OK)
10926 goto out;
10927
10928 // We'll potentially be freeing page tables, so we need to wait for any
10929 // outstanding work before we start
10930 status = uvm_tracker_wait(&existing_va_block->tracker);
10931 if (status != NV_OK)
10932 goto out;
10933
10934 // Update existing's state only once we're past all failure points
10935
10936 event_data.block_shrink.block = existing_va_block;
10937 uvm_perf_event_notify(&va_space->perf_events, UVM_PERF_EVENT_BLOCK_SHRINK, &event_data);
10938
10939 block_split_cpu(existing_va_block, new_block);
10940
10941 for_each_gpu_id(id)
10942 block_split_gpu(existing_va_block, new_block, id);
10943
10944 // Update the size of the existing block first so that
10945 // block_set_processor_masks can use block_{set,clear}_resident_processor
10946 // that relies on the size to be correct.
10947 existing_va_block->end = new_end;
10948
10949 block_split_page_mask(&existing_va_block->read_duplicated_pages,
10950 uvm_va_block_num_cpu_pages(existing_va_block),
10951 &new_block->read_duplicated_pages,
10952 uvm_va_block_num_cpu_pages(new_block));
10953
10954 if (!uvm_va_block_is_hmm(existing_va_block)) {
10955 block_split_page_mask(&existing_va_block->maybe_mapped_pages,
10956 uvm_va_block_num_cpu_pages(existing_va_block),
10957 &new_block->maybe_mapped_pages,
10958 uvm_va_block_num_cpu_pages(new_block));
10959 }
10960
10961 block_set_processor_masks(existing_va_block);
10962 block_set_processor_masks(new_block);
10963
10964 if (uvm_va_block_is_hmm(existing_va_block)) {
10965 uvm_hmm_va_block_split_tree(existing_va_block, new_block);
10966 uvm_va_policy_node_split_move(existing_va_block, new_block);
10967 }
10968
10969 out:
10970 // Run checks on existing_va_block even on failure, since an error must
10971 // leave the block in a consistent state.
10972 UVM_ASSERT(block_check_chunks(existing_va_block));
10973 UVM_ASSERT(block_check_mappings(existing_va_block, va_block_context));
10974 if (status == NV_OK) {
10975 UVM_ASSERT(block_check_chunks(new_block));
10976 UVM_ASSERT(block_check_mappings(new_block, va_block_context));
10977 }
10978 else {
10979 int nid;
10980
10981 for_each_possible_uvm_node(nid)
10982 block_free_cpu_chunk_storage(new_block, nid);
10983 }
10984
10985 uvm_mutex_unlock_nested(&new_block->lock);
10986
10987 return status;
10988 }
10989
10990 static bool block_region_might_read_duplicate(uvm_va_block_t *va_block,
10991 uvm_va_block_region_t region)
10992 {
10993 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
10994 uvm_va_range_t *va_range = va_block->va_range;
10995
10996 if (!uvm_va_space_can_read_duplicate(va_space, NULL))
10997 return false;
10998
10999 // TODO: Bug 3660922: need to implement HMM read duplication support.
11000 if (uvm_va_block_is_hmm(va_block) ||
11001 uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_DISABLED)
11002 return false;
11003
11004 if (uvm_va_range_get_policy(va_range)->read_duplication == UVM_READ_DUPLICATION_UNSET
11005 && uvm_page_mask_region_weight(&va_block->read_duplicated_pages, region) == 0)
11006 return false;
11007
11008 return true;
11009 }
11010
11011 // Returns the new access permission for the processor that faulted or
11012 // triggered access counter notifications on the given page
11013 //
11014 // TODO: Bug 1766424: this function works on a single page at a time. This
11015 // could be changed in the future to optimize multiple faults/counters on
11016 // contiguous pages.
11017 static uvm_prot_t compute_new_permission(uvm_va_block_t *va_block,
11018 uvm_va_block_context_t *va_block_context,
11019 uvm_page_index_t page_index,
11020 uvm_processor_id_t fault_processor_id,
11021 uvm_processor_id_t new_residency,
11022 uvm_fault_access_type_t access_type)
11023 {
11024 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11025 uvm_prot_t logical_prot, new_prot;
11026 uvm_processor_mask_t *revoke_processors = &va_block_context->scratch_processor_mask;
11027 struct vm_area_struct *hmm_vma = va_block_context->hmm.vma;
11028
11029 // TODO: Bug 1766432: Refactor into policies. Current policy is
11030 // query_promote: upgrade access privileges to avoid future faults IF
11031 // they don't trigger further revocations.
11032 new_prot = uvm_fault_access_type_to_prot(access_type);
11033 logical_prot = compute_logical_prot(va_block, hmm_vma, page_index);
11034
11035 UVM_ASSERT(logical_prot >= new_prot);
11036
11037 if ((logical_prot > UVM_PROT_READ_ONLY) &&
11038 (new_prot == UVM_PROT_READ_ONLY) &&
11039 !block_region_might_read_duplicate(va_block, uvm_va_block_region_for_page(page_index))) {
11040
11041 block_page_authorized_processors(va_block,
11042 page_index,
11043 UVM_PROT_READ_WRITE_ATOMIC,
11044 revoke_processors);
11045
11046 uvm_processor_mask_andnot(revoke_processors,
11047 revoke_processors,
11048 &va_space->has_native_atomics[uvm_id_value(new_residency)]);
11049
11050 // Only check if there are no faultable processors in the revoke
11051 // processors mask.
11052 uvm_processor_mask_and(revoke_processors, revoke_processors, &va_space->faultable_processors);
11053
11054 if (uvm_processor_mask_empty(revoke_processors))
11055 new_prot = UVM_PROT_READ_WRITE;
11056 }
11057
11058 if (logical_prot == UVM_PROT_READ_WRITE_ATOMIC && new_prot == UVM_PROT_READ_WRITE) {
11059 if (uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], fault_processor_id))
11060 new_prot = UVM_PROT_READ_WRITE_ATOMIC;
11061 }
11062
11063 return new_prot;
11064 }
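// Worked example (hypothetical scenario, for illustration): a GPU takes a
// read fault (new_prot == UVM_PROT_READ_ONLY) on a page whose VMA allows
// writes (logical_prot == UVM_PROT_READ_WRITE). If the page is not a
// read-duplication candidate and no faultable processor outside the new
// residency's native-atomics mask already holds an atomic mapping, the check
// above upgrades new_prot to UVM_PROT_READ_WRITE to avoid a likely follow-up
// write fault.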
11065
11066 static NV_STATUS do_block_add_mappings_after_migration(uvm_va_block_t *va_block,
11067 uvm_va_block_context_t *va_block_context,
11068 uvm_processor_id_t new_residency,
11069 uvm_processor_id_t processor_id,
11070 const uvm_processor_mask_t *map_processors,
11071 uvm_va_block_region_t region,
11072 const uvm_page_mask_t *map_page_mask,
11073 uvm_prot_t max_prot,
11074 const uvm_processor_mask_t *thrashing_processors,
11075 uvm_tracker_t *tracker)
11076 {
11077 NV_STATUS status = NV_OK;
11078 uvm_processor_id_t map_processor_id;
11079 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11080 uvm_prot_t new_map_prot = max_prot;
11081 uvm_processor_mask_t *map_processors_local;
11082 uvm_processor_mask_t *native_atomics_mask = &va_space->has_native_atomics[uvm_id_value(new_residency)];
11083
11084 map_processors_local = uvm_processor_mask_cache_alloc();
11085 if (!map_processors_local)
11086 return NV_ERR_NO_MEMORY;
11087
11088 uvm_processor_mask_copy(map_processors_local, map_processors);
11089
11090 // Handle atomic mappings separately
11091 if (max_prot == UVM_PROT_READ_WRITE_ATOMIC) {
11092 if (uvm_processor_mask_test(native_atomics_mask, processor_id)) {
11093
11094 for_each_id_in_mask(map_processor_id, map_processors_local) {
11095 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
11096
11097 // Skip processors without native atomics to the residency.
11098 if (!uvm_processor_mask_test(native_atomics_mask, map_processor_id))
11099 continue;
11100
11101 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
11102 cause = UvmEventMapRemoteCauseThrashing;
11103
11104 status = uvm_va_block_map(va_block,
11105 va_block_context,
11106 map_processor_id,
11107 region,
11108 map_page_mask,
11109 UVM_PROT_READ_WRITE_ATOMIC,
11110 cause,
11111 tracker);
11112 if (status != NV_OK)
11113 goto out;
11114 }
11115
11116 // Filter out these mapped processors for the next steps
11117 uvm_processor_mask_andnot(map_processors_local, map_processors_local, native_atomics_mask);
11118
11119 new_map_prot = UVM_PROT_READ_WRITE;
11120 }
11121 else {
11122 if (UVM_ID_IS_CPU(processor_id))
11123 new_map_prot = UVM_PROT_READ_WRITE;
11124 else
11125 new_map_prot = UVM_PROT_READ_ONLY;
11126 }
11127 }
11128
11129 // Map the rest of processors
11130 for_each_id_in_mask(map_processor_id, map_processors_local) {
11131 UvmEventMapRemoteCause cause = UvmEventMapRemoteCausePolicy;
11132 uvm_prot_t final_map_prot;
11133 bool map_processor_has_enabled_system_wide_atomics =
11134 uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, map_processor_id);
11135
11136 // Write mappings from processors with disabled system-wide atomics are treated like atomics
11137 if (new_map_prot == UVM_PROT_READ_WRITE && !map_processor_has_enabled_system_wide_atomics)
11138 final_map_prot = UVM_PROT_READ_WRITE_ATOMIC;
11139 else
11140 final_map_prot = new_map_prot;
11141
11142 if (thrashing_processors && uvm_processor_mask_test(thrashing_processors, map_processor_id))
11143 cause = UvmEventMapRemoteCauseThrashing;
11144
11145 status = uvm_va_block_map(va_block,
11146 va_block_context,
11147 map_processor_id,
11148 region,
11149 map_page_mask,
11150 final_map_prot,
11151 cause,
11152 tracker);
11153 if (status != NV_OK)
11154 goto out;
11155 }
11156
11157 out:
11158 uvm_processor_mask_cache_free(map_processors_local);
11159
11160 return status;
11161 }
11162
11163 NV_STATUS uvm_va_block_add_mappings_after_migration(uvm_va_block_t *va_block,
11164 uvm_va_block_context_t *va_block_context,
11165 uvm_processor_id_t new_residency,
11166 uvm_processor_id_t processor_id,
11167 uvm_va_block_region_t region,
11168 const uvm_page_mask_t *map_page_mask,
11169 uvm_prot_t max_prot,
11170 const uvm_processor_mask_t *thrashing_processors)
11171 {
11172 NV_STATUS tracker_status, status = NV_OK;
11173 uvm_processor_mask_t *map_other_processors = NULL;
11174 uvm_processor_mask_t *map_uvm_lite_gpus = NULL;
11175 uvm_processor_id_t map_processor_id;
11176 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11177 const uvm_page_mask_t *final_page_mask = map_page_mask;
11178 uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
11179 const uvm_va_policy_t *policy = uvm_va_policy_get_region(va_block, region);
11180 uvm_processor_id_t preferred_location;
11181
11182 uvm_assert_mutex_locked(&va_block->lock);
11183
11184 map_other_processors = uvm_processor_mask_cache_alloc();
11185 if (!map_other_processors) {
11186 status = NV_ERR_NO_MEMORY;
11187 goto out;
11188 }
11189
11190 map_uvm_lite_gpus = uvm_processor_mask_cache_alloc();
11191 if (!map_uvm_lite_gpus) {
11192 status = NV_ERR_NO_MEMORY;
11193 goto out;
11194 }
11195
11196 // Read duplication takes precedence over SetAccessedBy.
11197 //
11198 // Exclude ranges with read duplication set...
11199 if (uvm_va_policy_is_read_duplicate(policy, va_space)) {
11200 status = NV_OK;
11201 goto out;
11202 }
11203
11204 // ... and pages read-duplicated by performance heuristics
11205 if (policy->read_duplication == UVM_READ_DUPLICATION_UNSET) {
11206 if (map_page_mask) {
11207 uvm_page_mask_andnot(&va_block_context->mapping.filtered_page_mask,
11208 map_page_mask,
11209 &va_block->read_duplicated_pages);
11210 }
11211 else {
11212 uvm_page_mask_complement(&va_block_context->mapping.filtered_page_mask, &va_block->read_duplicated_pages);
11213 }
11214 final_page_mask = &va_block_context->mapping.filtered_page_mask;
11215 }
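// For illustration (hypothetical masks): if map_page_mask covers pages 0..7
// and pages 2..3 are already in read_duplicated_pages, filtered_page_mask
// ends up covering pages 0..1 and 4..7, and only those pages receive the
// SetAccessedBy mappings below.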
11216
11217 // Add mappings for accessed_by processors and the given processor mask
11218 if (thrashing_processors)
11219 uvm_processor_mask_or(map_other_processors, &policy->accessed_by, thrashing_processors);
11220 else
11221 uvm_processor_mask_copy(map_other_processors, &policy->accessed_by);
11222
11223 // Only processors that can access the new location must be considered
11224 uvm_processor_mask_and(map_other_processors,
11225 map_other_processors,
11226 &va_space->accessible_from[uvm_id_value(new_residency)]);
11227
11228 // Exclude caller processor as it must have already been mapped
11229 uvm_processor_mask_clear(map_other_processors, processor_id);
11230
11231 // Exclude preferred location so it won't get remote mappings
11232 preferred_location = policy->preferred_location;
11233 if (UVM_ID_IS_VALID(preferred_location) &&
11234 !uvm_id_equal(new_residency, preferred_location) &&
11235 uvm_va_space_processor_has_memory(va_space, preferred_location)) {
11236 uvm_processor_mask_clear(map_other_processors, preferred_location);
11237 }
11238
11239 // Map the UVM-Lite GPUs if the new location is the preferred location. This
11240 // will only create mappings on first touch. After that they're persistent
11241 // so uvm_va_block_map will be a no-op.
11242 uvm_processor_mask_and(map_uvm_lite_gpus, map_other_processors, block_get_uvm_lite_gpus(va_block));
11243 if (!uvm_processor_mask_empty(map_uvm_lite_gpus) &&
11244 uvm_va_policy_preferred_location_equal(policy, new_residency, va_block_context->make_resident.dest_nid)) {
11245 for_each_id_in_mask (map_processor_id, map_uvm_lite_gpus) {
11246 status = uvm_va_block_map(va_block,
11247 va_block_context,
11248 map_processor_id,
11249 region,
11250 final_page_mask,
11251 UVM_PROT_READ_WRITE_ATOMIC,
11252 UvmEventMapRemoteCauseCoherence,
11253 &local_tracker);
11254 if (status != NV_OK)
11255 goto out;
11256 }
11257 }
11258
11259 uvm_processor_mask_andnot(map_other_processors, map_other_processors, block_get_uvm_lite_gpus(va_block));
11260
11261 // We can't map non-migratable pages to the CPU. If we have any, build a
11262 // new mask of migratable pages and map the CPU separately.
11263 if (uvm_processor_mask_test(map_other_processors, UVM_ID_CPU) &&
11264 !uvm_range_group_all_migratable(va_space,
11265 uvm_va_block_region_start(va_block, region),
11266 uvm_va_block_region_end(va_block, region))) {
11267 uvm_page_mask_t *migratable_mask = &va_block_context->mapping.migratable_mask;
11268
11269 uvm_range_group_migratable_page_mask(va_block, region, migratable_mask);
11270 if (uvm_page_mask_and(migratable_mask, migratable_mask, final_page_mask)) {
11271 status = do_block_add_mappings_after_migration(va_block,
11272 va_block_context,
11273 new_residency,
11274 processor_id,
11275 &g_uvm_processor_mask_cpu,
11276 region,
11277 migratable_mask,
11278 max_prot,
11279 thrashing_processors,
11280 &local_tracker);
11281 if (status != NV_OK)
11282 goto out;
11283 }
11284
11285 uvm_processor_mask_clear(map_other_processors, UVM_ID_CPU);
11286 }
11287
11288 status = do_block_add_mappings_after_migration(va_block,
11289 va_block_context,
11290 new_residency,
11291 processor_id,
11292 map_other_processors,
11293 region,
11294 final_page_mask,
11295 max_prot,
11296 thrashing_processors,
11297 &local_tracker);
11298 if (status != NV_OK)
11299 goto out;
11300
11301 out:
11302 tracker_status = uvm_tracker_add_tracker_safe(&va_block->tracker, &local_tracker);
11303 uvm_tracker_deinit(&local_tracker);
11304 uvm_processor_mask_cache_free(map_other_processors);
11305 uvm_processor_mask_cache_free(map_uvm_lite_gpus);
11306
11307 return status == NV_OK ? tracker_status : status;
11308 }
11309
11310 uvm_prot_t uvm_va_block_page_compute_highest_permission(uvm_va_block_t *va_block,
11311 uvm_va_block_context_t *va_block_context,
11312 uvm_processor_id_t processor_id,
11313 uvm_page_index_t page_index)
11314 {
11315 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11316 uvm_processor_mask_t *resident_processors = &va_block_context->scratch_processor_mask;
11317 NvU32 resident_processors_count;
11318
11319 if (uvm_processor_mask_test(block_get_uvm_lite_gpus(va_block), processor_id))
11320 return UVM_PROT_READ_WRITE_ATOMIC;
11321
11322 uvm_va_block_page_resident_processors(va_block, page_index, resident_processors);
11323 resident_processors_count = uvm_processor_mask_get_count(resident_processors);
11324
11325 if (resident_processors_count == 0) {
11326 return UVM_PROT_NONE;
11327 }
11328 else if (resident_processors_count > 1) {
11329 // If there are many copies, we can only map READ ONLY
11330 //
11331 // The block state doesn't track the mapping target (aperture) of each
11332 // individual PTE, just the permissions and where the data is resident.
11333 // If the data is resident in multiple places, then we have a problem
11334 // since we can't know where the PTE points. This means we won't know
11335 // what needs to be unmapped for cases like UvmUnregisterGpu and
11336 // UvmDisablePeerAccess.
11337 //
11338 // The simple way to solve this is to enforce that a read-duplication
11339 // mapping always points to local memory.
11340 if (uvm_processor_mask_test(resident_processors, processor_id))
11341 return UVM_PROT_READ_ONLY;
11342
11343 return UVM_PROT_NONE;
11344 }
11345 else {
11346 uvm_processor_id_t atomic_id;
11347 uvm_processor_id_t residency;
11348 uvm_processor_mask_t *atomic_mappings;
11349 uvm_processor_mask_t *write_mappings;
11350
11351 // Find the id of the processor holding the only resident copy
11352 residency = uvm_processor_mask_find_first_id(resident_processors);
11353 UVM_ASSERT(UVM_ID_IS_VALID(residency));
11354
11355 // If we cannot map the processor with the resident copy, exit
11356 if (!uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(residency)], processor_id))
11357 return UVM_PROT_NONE;
11358
11359 // Fast path: if the page is not mapped anywhere else, it can be safely
11360 // mapped with RWA permission
11361 if (!uvm_page_mask_test(&va_block->maybe_mapped_pages, page_index) &&
11362 !uvm_va_block_is_hmm(va_block))
11363 return UVM_PROT_READ_WRITE_ATOMIC;
11364
11365 atomic_mappings = &va_block_context->scratch_processor_mask;
11366
11367 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE_ATOMIC, atomic_mappings);
11368
11369 // Exclude processors with system-wide atomics disabled from atomic_mappings
11370 uvm_processor_mask_and(atomic_mappings, atomic_mappings, &va_space->system_wide_atomics_enabled_processors);
11371
11372 // Exclude the processor for which the mapping protections are being computed
11373 uvm_processor_mask_clear(atomic_mappings, processor_id);
11374
11375 // If there is any processor with atomic mapping, check if it has native atomics to the processor
11376 // with the resident copy. If it does not, we can only map READ ONLY
11377 atomic_id = uvm_processor_mask_find_first_id(atomic_mappings);
11378 if (UVM_ID_IS_VALID(atomic_id) &&
11379 !uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], atomic_id)) {
11380 return UVM_PROT_READ_ONLY;
11381 }
11382
11383 write_mappings = &va_block_context->scratch_processor_mask;
11384
11385 block_page_authorized_processors(va_block, page_index, UVM_PROT_READ_WRITE, write_mappings);
11386
11387 // Exclude the processor for which the mapping protections are being computed
11388 uvm_processor_mask_clear(write_mappings, processor_id);
11389
11390 // At this point, any processor with atomic mappings either has native
11391 // atomics support to the processor with the resident copy or has
11392 // disabled system-wide atomics. If the requesting processor has
11393 // disabled system-wide atomics or has native atomics to that processor,
11394 // we can map with ATOMIC privileges. Likewise, if there are no other
11395 // processors with WRITE or ATOMIC mappings, we can map with ATOMIC
11396 // privileges. For HMM, don't allow GPU atomic access to remote mapped
11397 // system memory even if there are no write mappings since CPU access
11398 // can be upgraded without notification.
11399 if (!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, processor_id) ||
11400 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id) ||
11401 (uvm_processor_mask_empty(write_mappings) && !uvm_va_block_is_hmm(va_block))) {
11402 return UVM_PROT_READ_WRITE_ATOMIC;
11403 }
11404
11405 return UVM_PROT_READ_WRITE;
11406 }
11407 }
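// Two illustrative cases (hypothetical block states): if a page is resident
// on both the CPU and a GPU (read duplication), a processor holding one of
// the copies gets UVM_PROT_READ_ONLY and any other processor gets
// UVM_PROT_NONE. If instead the page has a single resident copy and, for a
// non-HMM block, is not mapped anywhere else, the requesting processor gets
// UVM_PROT_READ_WRITE_ATOMIC via the fast path above.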
11408
11409 NV_STATUS uvm_va_block_add_mappings(uvm_va_block_t *va_block,
11410 uvm_va_block_context_t *va_block_context,
11411 uvm_processor_id_t processor_id,
11412 uvm_va_block_region_t region,
11413 const uvm_page_mask_t *page_mask,
11414 UvmEventMapRemoteCause cause)
11415 {
11416 uvm_va_range_t *va_range = va_block->va_range;
11417 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11418 NV_STATUS status = NV_OK;
11419 uvm_page_index_t page_index;
11420 uvm_range_group_range_iter_t iter;
11421 uvm_prot_t prot_to_map;
11422
11423 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
11424 if (!uvm_va_range_vma_check(va_range, va_block_context->mm))
11425 return NV_OK;
11426
11427 uvm_range_group_range_migratability_iter_first(va_space,
11428 uvm_va_block_region_start(va_block, region),
11429 uvm_va_block_region_end(va_block, region),
11430 &iter);
11431 }
11432
11433 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map)
11434 va_block_context->mask_by_prot[prot_to_map - 1].count = 0;
11435
11436 for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
11437 // Read duplication takes precedence over SetAccessedBy. Exclude pages
11438 // read-duplicated by performance heuristics
11439 if (uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))
11440 continue;
11441
11442 prot_to_map = uvm_va_block_page_compute_highest_permission(va_block,
11443 va_block_context,
11444 processor_id,
11445 page_index);
11446 if (prot_to_map == UVM_PROT_NONE)
11447 continue;
11448
11449 if (UVM_ID_IS_CPU(processor_id) && !uvm_va_block_is_hmm(va_block)) {
11450 while (uvm_va_block_cpu_page_index(va_block, iter.end) < page_index) {
11451 uvm_range_group_range_migratability_iter_next(va_space,
11452 &iter,
11453 uvm_va_block_region_end(va_block, region));
11454 }
11455
11456 if (!iter.migratable)
11457 continue;
11458 }
11459
11460 if (va_block_context->mask_by_prot[prot_to_map - 1].count++ == 0)
11461 uvm_page_mask_zero(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask);
11462
11463 uvm_page_mask_set(&va_block_context->mask_by_prot[prot_to_map - 1].page_mask, page_index);
11464 }
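// For illustration (hypothetical result of the loop above): if pages 0..3
// compute to UVM_PROT_READ_ONLY and pages 4..7 to UVM_PROT_READ_WRITE_ATOMIC,
// the loop below issues two uvm_va_block_map() calls, one per protection
// level, each with the corresponding page mask.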
11465
11466 for (prot_to_map = UVM_PROT_READ_ONLY; prot_to_map <= UVM_PROT_READ_WRITE_ATOMIC; ++prot_to_map) {
11467 if (va_block_context->mask_by_prot[prot_to_map - 1].count == 0)
11468 continue;
11469
11470 status = uvm_va_block_map(va_block,
11471 va_block_context,
11472 processor_id,
11473 region,
11474 &va_block_context->mask_by_prot[prot_to_map - 1].page_mask,
11475 prot_to_map,
11476 cause,
11477 &va_block->tracker);
11478 if (status != NV_OK)
11479 break;
11480 }
11481
11482 return status;
11483 }
11484
11485 static bool can_read_duplicate(uvm_va_block_t *va_block,
11486 uvm_page_index_t page_index,
11487 const uvm_va_policy_t *policy,
11488 const uvm_perf_thrashing_hint_t *thrashing_hint)
11489 {
11490 if (uvm_va_policy_is_read_duplicate(policy, uvm_va_block_get_va_space(va_block)))
11491 return true;
11492
11493 if (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
11494 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index) &&
11495 thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_PIN)
11496 return true;
11497
11498 return false;
11499 }
11500
11501 // TODO: Bug 1827400: If the faulting processor has support for native
11502 // atomics to the current location and the faults on the page were
11503 // triggered by atomic accesses only, we keep the current residency.
11504 // This is a short-term solution to exercise remote atomics over
11505 // NVLINK when possible (not only when preferred location is set to
11506 // the remote GPU) as they are much faster than relying on page
11507 // faults and permission downgrades, which cause thrashing. In the
11508 // future, the thrashing detection/prevention heuristics should
11509 // detect and handle this case.
11510 static bool map_remote_on_atomic_fault(uvm_va_space_t *va_space,
11511 NvU32 access_type_mask,
11512 uvm_processor_id_t processor_id,
11513 uvm_processor_id_t residency)
11514 {
11515 // This policy can be enabled/disabled using a module parameter
11516 if (!uvm_perf_map_remote_on_native_atomics_fault)
11517 return false;
11518
11519 // Only consider atomics faults
11520 if (uvm_fault_access_type_mask_lowest(access_type_mask) < UVM_FAULT_ACCESS_TYPE_ATOMIC_WEAK)
11521 return false;
11522
11523 // We cannot differentiate CPU writes from atomics, so we exclude CPU faults
11524 // from the logic above in order to avoid mapping the CPU to vidmem in
11525 // response to a plain write.
11526 if (UVM_ID_IS_CPU(processor_id))
11527 return false;
11528
11529 // On P9 systems (which have native HW support for system-wide atomics), we
11530 // have determined experimentally that placing memory on a GPU yields the
11531 // best performance on most cases (since CPU can cache vidmem but not vice
11532 // versa). Therefore, don't map remotely if the current residency is
11533 // sysmem.
11534 if (UVM_ID_IS_CPU(residency))
11535 return false;
11536
11537 return uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(residency)], processor_id);
11538 }
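// For illustration (hypothetical topology): with
// uvm_perf_map_remote_on_native_atomics_fault enabled, an atomic fault from
// GPU0 on a page resident on GPU1 returns true when GPU0 has native atomics
// to GPU1 (e.g. over NVLINK), so the fault is serviced with a remote mapping
// instead of migrating the page.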
11539
11540 // TODO: Bug 1766424: this function works on a single page at a time. This
11541 // could be changed in the future to optimize multiple faults or access
11542 // counter notifications on contiguous pages.
11543 static uvm_processor_id_t block_select_processor_residency(uvm_va_block_t *va_block,
11544 uvm_va_block_context_t *va_block_context,
11545 uvm_page_index_t page_index,
11546 uvm_processor_id_t processor_id,
11547 NvU32 access_type_mask,
11548 const uvm_va_policy_t *policy,
11549 const uvm_perf_thrashing_hint_t *thrashing_hint,
11550 uvm_service_operation_t operation,
11551 const bool hmm_migratable,
11552 bool *read_duplicate)
11553 {
11554 uvm_processor_id_t closest_resident_processor;
11555 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11556 bool may_read_duplicate;
11557 uvm_processor_id_t preferred_location;
11558
11559 // TODO: Bug 3660968: Remove uvm_hmm_force_sysmem_set() check as soon as
11560 // HMM migration is implemented for VMAs other than anonymous memory.
11561 // TODO: Bug 4050579: Remove hmm_migratable check when swap cached pages
11562 // can be migrated.
11563 if (is_uvm_fault_force_sysmem_set() ||
11564 !hmm_migratable ||
11565 uvm_hmm_must_use_sysmem(va_block, va_block_context->hmm.vma)) {
11566 *read_duplicate = false;
11567 return UVM_ID_CPU;
11568 }
11569
11570 may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
11571
11572 // Read/prefetch faults on a VA range with read duplication enabled
11573 // always create a copy of the page on the faulting processor's memory.
11574 // Note that access counters always use UVM_FAULT_ACCESS_TYPE_PREFETCH,
11575 // which will lead to read duplication if it is enabled.
11576 *read_duplicate = may_read_duplicate &&
11577 (uvm_fault_access_type_mask_highest(access_type_mask) <= UVM_FAULT_ACCESS_TYPE_READ);
11578
11579 if (*read_duplicate)
11580 return processor_id;
11581
11582 *read_duplicate = false;
11583
11584 // If read-duplication is active in the page but we are not
11585 // read-duplicating because the access type is not a read or a prefetch,
11586 // the faulting processor should get a local copy
11587 if (may_read_duplicate)
11588 return processor_id;
11589
11590 // If the faulting processor is the preferred location always migrate
11591 preferred_location = policy->preferred_location;
11592 if (uvm_id_equal(processor_id, preferred_location)) {
11593 if (thrashing_hint->type != UVM_PERF_THRASHING_HINT_TYPE_NONE) {
11594 UVM_ASSERT(thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN);
11595 if (uvm_va_space_processor_has_memory(va_space, processor_id))
11596 UVM_ASSERT(uvm_id_equal(thrashing_hint->pin.residency, processor_id));
11597 }
11598
11599 return processor_id;
11600 }
11601
11602 // If the faulting processor is the CPU, HMM has to migrate the block to
11603 // system memory.
11604 // TODO: Bug 3900021: [UVM-HMM] investigate thrashing improvements.
11605 if (UVM_ID_IS_CPU(processor_id) && uvm_va_block_is_hmm(va_block))
11606 return processor_id;
11607
11608 if (thrashing_hint->type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
11609 UVM_ASSERT(uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(thrashing_hint->pin.residency)],
11610 processor_id));
11611 return thrashing_hint->pin.residency;
11612 }
11613
11614 closest_resident_processor = uvm_va_block_page_get_closest_resident(va_block,
11615 va_block_context,
11616 page_index,
11617 processor_id);
11618
11619 // If the page is not resident anywhere, select the preferred location as
11620 // long as the preferred location is accessible from the faulting processor.
11621 // Otherwise select the faulting processor.
11622 if (UVM_ID_IS_INVALID(closest_resident_processor)) {
11623 if (UVM_ID_IS_VALID(preferred_location) &&
11624 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)],
11625 processor_id)) {
11626 return preferred_location;
11627 }
11628
11629 return processor_id;
11630 }
11631
11632 // AccessedBy mappings might not have been created for the CPU if the thread
11633 // which made the memory resident did not have the proper references on the
11634 // mm_struct (for example, the GPU fault handling path when
11635 // uvm_va_space_mm_enabled() is false).
11636 //
11637 // Also, in uvm_migrate_*, we implement a two-pass scheme in which
11638 // AccessedBy mappings may be delayed to the second pass. This can produce
11639 // faults even if the faulting processor is in the accessed_by mask.
11640 //
11641 // Here, we keep it on the current residency and we just add the missing
11642 // mapping.
11643 if (uvm_processor_mask_test(&policy->accessed_by, processor_id) &&
11644 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
11645 operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
11646 return closest_resident_processor;
11647 }
11648
11649 // Check if we should map the closest resident processor remotely on atomic
11650 // fault
11651 if (map_remote_on_atomic_fault(va_space, access_type_mask, processor_id, closest_resident_processor))
11652 return closest_resident_processor;
11653
11654 // If the processor has access to the preferred location, and the page is
11655 // not resident on the accessing processor, move it to the preferred
11656 // location.
11657 if (!uvm_id_equal(closest_resident_processor, processor_id) &&
11658 UVM_ID_IS_VALID(preferred_location) &&
11659 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(preferred_location)], processor_id))
11660 return preferred_location;
11661
11662 // Check if we should map the closest resident processor remotely on a remote CPU fault.
11663 //
11664 // A CPU fault is serviced on behalf of a Linux process whose address space is
11665 // pointed to by current->mm. A block of memory resident on a GPU is also associated
11666 // with an address space, pointed to by va_block_context->mm. If the two match, this
11667 // is a regular (local) fault and we may want to migrate the page from the GPU to the
11668 // CPU. If it is a 'remote' fault, i.e. the faulting process differs from the one
11669 // associated with the block, we may prefer to preserve the current residency.
11670 //
11671 // Servicing a remote fault without access counters means the memory could stay in
11672 // the wrong spot for a long time, which is why we prefer to avoid creating remote
11673 // mappings. However, when a NIC accesses memory resident on a GPU, it is worth
11674 // keeping it in place for those NIC accesses.
11675 //
11676 // The logic used to detect remote faulting also keeps memory in place for ptrace
11677 // accesses. We would prefer to control those policies separately, but the NIC case
11678 // takes priority.
11679 // If the accessing processor is the CPU, we're either handling a fault from a
11680 // process other than the owning one, or we're handling an MOMC notification. Only
11681 // prevent migration for the former.
11682 if (UVM_ID_IS_CPU(processor_id) &&
11683 operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
11684 uvm_processor_mask_test(&va_space->accessible_from[uvm_id_value(closest_resident_processor)], processor_id) &&
11685 va_block_context->mm != current->mm) {
11686 UVM_ASSERT(va_block_context->mm != NULL);
11687 return closest_resident_processor;
11688 }
11689
11690 // If the page is resident on a processor other than the preferred location,
11691 // or the faulting processor can't access the preferred location, we select
11692 // the faulting processor as the new residency.
11693 return processor_id;
11694 }
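// A condensed decision example (hypothetical fault, for illustration): a GPU
// read fault on a non-resident page with read duplication enabled returns the
// faulting GPU with *read_duplicate set. The same fault with read duplication
// disabled, no thrashing pin and an unset preferred location also returns the
// faulting GPU, but with *read_duplicate cleared.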
11695
11696 static int block_select_node_residency(uvm_va_block_t *va_block,
11697 uvm_page_index_t page_index,
11698 uvm_processor_id_t new_residency,
11699 const uvm_va_policy_t *policy,
11700 const uvm_perf_thrashing_hint_t *thrashing_hint)
11701 {
11702 // For CPU faults, the fault handler runs on the CPU that faulted.
11703 // For GPU faults, the bottom half is pinned to CPUs closest to their GPU.
11704 // Therefore, in both cases, we can use numa_mem_id() to get the NUMA node
11705 // ID of the faulting processor.
11706 // Note that numa_mem_id() returns the nearest node with memory. In most
11707 // cases, this will be the current NUMA node. However, in the case that the
11708 // current node does not have any memory, we probably want the nearest node
11709 // with memory, anyway.
11710 int current_nid = numa_mem_id();
11711 bool may_read_duplicate = can_read_duplicate(va_block, page_index, policy, thrashing_hint);
11712
11713 // For HMM allocations UVM doesn't always control allocation of the
11714 // destination page as the kernel may have already allocated one. Therefore
11715 // we can't respect the preferred node ID for HMM pages.
11716 // TODO: Bug 4453874: [UVM-HMM] Respect the preferred CPU NUMA Node ID when making a HMM page resident
11717 if (uvm_va_block_is_hmm(va_block))
11718 return NUMA_NO_NODE;
11719
11720 // If the new resident processor is not the CPU, return the preferred nid
11721 // since it could be used for CPU allocations of staging pages.
11722 if (!UVM_ID_IS_CPU(new_residency))
11723 return policy->preferred_nid;
11724
11725 // If the preferred location is the CPU, the new resident nid is the
11726 // preferred nid.
11727 if (UVM_ID_IS_CPU(policy->preferred_location))
11728 return policy->preferred_nid;
11729
11730 // If read duplication is enabled and the page is also resident on the CPU,
11731 // keep its current NUMA node residency.
11732 if (may_read_duplicate && uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
11733 return NUMA_NO_NODE;
11734
11735 // The new_residency processor is the CPU and the preferred location is not
11736 // the CPU. If the page is resident on the CPU, keep its current residency.
11737 if (uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index))
11738 return NUMA_NO_NODE;
11739
11740 return current_nid;
11741 }
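// For illustration (hypothetical policy on a non-HMM block): with
// new_residency == CPU, a preferred location of CPU NUMA node 2 returns node 2
// as the destination nid, while an unset preferred location on a page not yet
// resident on the CPU falls back to the faulting thread's nearest node with
// memory (numa_mem_id()).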
11742
11743 uvm_processor_id_t uvm_va_block_select_residency(uvm_va_block_t *va_block,
11744 uvm_va_block_context_t *va_block_context,
11745 uvm_page_index_t page_index,
11746 uvm_processor_id_t processor_id,
11747 NvU32 access_type_mask,
11748 const uvm_va_policy_t *policy,
11749 const uvm_perf_thrashing_hint_t *thrashing_hint,
11750 uvm_service_operation_t operation,
11751 const bool hmm_migratable,
11752 bool *read_duplicate)
11753 {
11754 uvm_processor_id_t id;
11755 int nid;
11756
11757 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
11758 va_block_context->hmm.vma,
11759 uvm_va_block_region_for_page(page_index)));
11760
11761 // First, select the processor for the new residency.
11762 id = block_select_processor_residency(va_block,
11763 va_block_context,
11764 page_index,
11765 processor_id,
11766 access_type_mask,
11767 policy,
11768 thrashing_hint,
11769 operation,
11770 hmm_migratable,
11771 read_duplicate);
11772
11773 // If the intended residency doesn't have memory, fall back to the CPU.
11774 if (!block_processor_has_memory(va_block, id)) {
11775 *read_duplicate = false;
11776 id = UVM_ID_CPU;
11777 }
11778
11779 // Now, that we know the new residency processor, select the NUMA node ID
11780 // based on the new processor.
11781 nid = block_select_node_residency(va_block, page_index, id, policy, thrashing_hint);
11782
11783 va_block_context->make_resident.dest_nid = nid;
11784
11785 return id;
11786 }
11787
11788 static bool check_access_counters_dont_revoke(uvm_va_block_t *block,
11789 uvm_va_block_context_t *block_context,
11790 uvm_va_block_region_t region,
11791 const uvm_processor_mask_t *revoke_processors,
11792 const uvm_page_mask_t *revoke_page_mask,
11793 uvm_prot_t revoke_prot)
11794 {
11795 uvm_processor_id_t id;
11796 for_each_id_in_mask(id, revoke_processors) {
11797 const uvm_page_mask_t *mapped_with_prot = block_map_with_prot_mask_get(block, id, revoke_prot);
11798
11799 uvm_page_mask_and(&block_context->caller_page_mask, revoke_page_mask, mapped_with_prot);
11800
11801 UVM_ASSERT(uvm_page_mask_region_weight(&block_context->caller_page_mask, region) == 0);
11802 }
11803
11804 return true;
11805 }
11806
11807 // Update service_context->prefetch_hint, service_context->per_processor_masks,
11808 // and service_context->region.
11809 static void uvm_va_block_get_prefetch_hint(uvm_va_block_t *va_block,
11810 const uvm_va_policy_t *policy,
11811 uvm_service_block_context_t *service_context)
11812 {
11813 uvm_processor_id_t new_residency;
11814 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11815
11816 // Performance heuristics policy: we only consider prefetching when all
11817 // migrations target a single processor.
11818 if (uvm_processor_mask_get_count(&service_context->resident_processors) == 1) {
11819 uvm_page_index_t page_index;
11820 uvm_page_mask_t *new_residency_mask;
11821
11822 new_residency = uvm_processor_mask_find_first_id(&service_context->resident_processors);
11823 new_residency_mask = &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11824
11825 // Update prefetch tracking structure with the pages that will migrate
11826 // due to faults
11827 uvm_perf_prefetch_get_hint_va_block(va_block,
11828 service_context->block_context,
11829 new_residency,
11830 new_residency_mask,
11831 service_context->region,
11832 &service_context->prefetch_bitmap_tree,
11833 &service_context->prefetch_hint);
11834
11835 // Obtain the prefetch hint and give a fake fault access type to the
11836 // prefetched pages
11837 if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency)) {
11838 const uvm_page_mask_t *prefetch_pages_mask = &service_context->prefetch_hint.prefetch_pages_mask;
11839
11840 for_each_va_block_page_in_mask(page_index, prefetch_pages_mask, va_block) {
11841 UVM_ASSERT(!uvm_page_mask_test(new_residency_mask, page_index));
11842
11843 service_context->access_type[page_index] = UVM_FAULT_ACCESS_TYPE_PREFETCH;
11844
11845 if (uvm_va_policy_is_read_duplicate(policy, va_space) ||
11846 (policy->read_duplication != UVM_READ_DUPLICATION_DISABLED &&
11847 uvm_page_mask_test(&va_block->read_duplicated_pages, page_index))) {
11848 if (service_context->read_duplicate_count++ == 0)
11849 uvm_page_mask_zero(&service_context->read_duplicate_mask);
11850
11851 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
11852 }
11853 }
11854
11855 uvm_page_mask_or(new_residency_mask, new_residency_mask, prefetch_pages_mask);
11856 service_context->region = uvm_va_block_region_from_mask(va_block, new_residency_mask);
11857 }
11858 }
11859 else {
11860 service_context->prefetch_hint.residency = UVM_ID_INVALID;
11861 }
11862 }
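// For illustration (hypothetical hint): if the faulted pages in
// new_residency_mask cover pages 3..4 and the prefetch hint adds pages 0..2
// and 5..7, those extra pages are tagged UVM_FAULT_ACCESS_TYPE_PREFETCH,
// merged into new_residency_mask, and service_context->region grows to cover
// pages 0..7.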
11863
11864 NV_STATUS uvm_va_block_service_copy(uvm_processor_id_t processor_id,
11865 uvm_processor_id_t new_residency,
11866 uvm_va_block_t *va_block,
11867 uvm_va_block_retry_t *block_retry,
11868 uvm_service_block_context_t *service_context)
11869 {
11870 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11871 uvm_processor_mask_t *all_involved_processors =
11872 &service_context->block_context->make_resident.all_involved_processors;
11873 uvm_page_mask_t *new_residency_mask =
11874 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11875 uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
11876 uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
11877 uvm_make_resident_cause_t cause;
11878 NV_STATUS status;
11879
11880 // 1- Migrate pages
11881 switch (service_context->operation) {
11882 case UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS:
11883 cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
11884 break;
11885 case UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS:
11886 cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
11887 break;
11888 case UVM_SERVICE_OPERATION_ACCESS_COUNTERS:
11889 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
11890 break;
11891 default:
11892 UVM_ASSERT_MSG(false, "Invalid operation value %d\n", service_context->operation);
11893 // Set cause to silence compiler warning that it may be unused.
11894 cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
11895 break;
11896 }
11897
11898 // Reset masks before all of the make_resident calls
11899 uvm_page_mask_zero(did_migrate_mask);
11900 uvm_processor_mask_zero(all_involved_processors);
11901
11902 // Handle read duplication first so that the caller_page_mask will be free
11903 // to use below and still valid in uvm_va_block_service_finish().
11904 // TODO: Bug 3660922: need to implement HMM read duplication support.
11905 if (service_context->read_duplicate_count != 0 &&
11906 uvm_page_mask_and(caller_page_mask,
11907 new_residency_mask,
11908 &service_context->read_duplicate_mask)) {
11909 status = uvm_va_block_make_resident_read_duplicate(va_block,
11910 block_retry,
11911 service_context->block_context,
11912 new_residency,
11913 service_context->region,
11914 caller_page_mask,
11915 &service_context->prefetch_hint.prefetch_pages_mask,
11916 cause);
11917 if (status != NV_OK)
11918 return status;
11919 }
11920
11921 if (service_context->read_duplicate_count == 0 ||
11922 uvm_page_mask_andnot(caller_page_mask, new_residency_mask, &service_context->read_duplicate_mask)) {
11923 if (service_context->read_duplicate_count == 0)
11924 uvm_page_mask_copy(caller_page_mask, new_residency_mask);
11925 status = uvm_va_block_make_resident_copy(va_block,
11926 block_retry,
11927 service_context->block_context,
11928 new_residency,
11929 service_context->region,
11930 caller_page_mask,
11931 &service_context->prefetch_hint.prefetch_pages_mask,
11932 cause);
11933 if (status != NV_OK)
11934 return status;
11935 }
11936
11937 if (UVM_ID_IS_CPU(processor_id) && !uvm_processor_mask_empty(all_involved_processors))
11938 service_context->cpu_fault.did_migrate = true;
11939
11940 // 2- Check for ECC errors on all GPUs involved in the migration if CPU is
11941 // the destination. Migrations in response to CPU faults are special
11942 // because they're on the only path (apart from tools) where CUDA is not
11943 // involved and wouldn't have a chance to do its own ECC checking.
11944 if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS &&
11945 UVM_ID_IS_CPU(new_residency) &&
11946 !uvm_processor_mask_empty(all_involved_processors)) {
11947 uvm_gpu_t *gpu;
11948
11949 // Before checking for ECC errors, make sure all of the GPU work
11950 // is finished. Creating mappings on the CPU would have to wait
11951 // for the tracker anyway so this shouldn't hurt performance.
11952 status = uvm_tracker_wait(&va_block->tracker);
11953 if (status != NV_OK)
11954 return status;
11955
11956 for_each_va_space_gpu_in_mask(gpu, va_space, all_involved_processors) {
11957 // We cannot call into RM here so use the no RM ECC check.
11958 status = uvm_gpu_check_ecc_error_no_rm(gpu);
11959 if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
11960 // In case we need to call into RM to be sure whether
11961 // there is an ECC error or not, signal that to the
11962 // caller by adding the GPU to the mask.
11963 //
11964 // In that case the ECC error might be noticed only after
11965 // the CPU mappings have been already created below,
11966 // exposing different CPU threads to the possibly corrupt
11967 // data, but this thread will fault eventually and that's
11968 // considered to be an acceptable trade-off between
11969 // performance and ECC error containment.
11970 uvm_processor_mask_set(&service_context->cpu_fault.gpus_to_check_for_ecc, gpu->id);
11971 status = NV_OK;
11972 }
11973 if (status != NV_OK)
11974 return status;
11975 }
11976 }
11977
11978 return NV_OK;
11979 }
11980
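// Second stage of servicing: finalize the residency changes started by
// uvm_va_block_service_copy(), compute the new mapping protections for the
// requesting processor, revoke stale permissions from other processors where
// required, map the requesting processor, and map SetAccessedBy processors
// for pages that migrated.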
11981 NV_STATUS uvm_va_block_service_finish(uvm_processor_id_t processor_id,
11982 uvm_va_block_t *va_block,
11983 uvm_service_block_context_t *service_context)
11984 {
11985 uvm_processor_id_t new_residency = service_context->block_context->make_resident.dest_id;
11986 uvm_page_mask_t *new_residency_mask =
11987 &service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency;
11988 uvm_page_mask_t *did_migrate_mask = &service_context->block_context->make_resident.pages_changed_residency;
11989 uvm_page_mask_t *caller_page_mask = &service_context->block_context->caller_page_mask;
11990 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
11991 uvm_prot_t new_prot;
11992 uvm_page_index_t page_index;
11993 uvm_processor_mask_t *revoke_processors;
11994 NV_STATUS status = NV_OK;
11995
11996 // Update residency.
11997 if (service_context->read_duplicate_count == 0 || !uvm_page_mask_empty(caller_page_mask))
11998 uvm_va_block_make_resident_finish(va_block,
11999 service_context->block_context,
12000 service_context->region,
12001 caller_page_mask);
12002
12003 uvm_page_mask_andnot(&service_context->did_not_migrate_mask, new_residency_mask, did_migrate_mask);
12004
12005 // The loops below depend on the enums having the following values in order
12006 // to index into service_context->mappings_by_prot[].
12007 BUILD_BUG_ON(UVM_PROT_READ_ONLY != 1);
12008 BUILD_BUG_ON(UVM_PROT_READ_WRITE != 2);
12009 BUILD_BUG_ON(UVM_PROT_READ_WRITE_ATOMIC != 3);
12010 BUILD_BUG_ON(UVM_PROT_MAX != 4);
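    // Given the enum values asserted above, a protection value prot indexes
    // mappings_by_prot[] at (prot - 1): UVM_PROT_READ_ONLY uses slot 0,
    // UVM_PROT_READ_WRITE slot 1, and UVM_PROT_READ_WRITE_ATOMIC slot 2.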
12011
12012 revoke_processors = uvm_processor_mask_cache_alloc();
12013 if (!revoke_processors)
12014 return NV_ERR_NO_MEMORY;
12015
12016 // 1- Compute mapping protections for the requesting processor on the new
12017 // residency.
12018 for (new_prot = UVM_PROT_READ_ONLY; new_prot < UVM_PROT_MAX; ++new_prot)
12019 service_context->mappings_by_prot[new_prot - 1].count = 0;
12020
12021 for_each_va_block_page_in_region_mask(page_index, new_residency_mask, service_context->region) {
12022 new_prot = compute_new_permission(va_block,
12023 service_context->block_context,
12024 page_index,
12025 processor_id,
12026 new_residency,
12027 service_context->access_type[page_index]);
12028
12029 if (service_context->mappings_by_prot[new_prot - 1].count++ == 0)
12030 uvm_page_mask_zero(&service_context->mappings_by_prot[new_prot - 1].page_mask);
12031
12032 uvm_page_mask_set(&service_context->mappings_by_prot[new_prot - 1].page_mask, page_index);
12033 }
12034
12035 // 2- Revoke permissions
12036 //
12037 // NOTE: uvm_va_block_make_resident_copy destroys mappings to old locations.
12038 // Thus, we need to revoke only if residency did not change and we
12039 // are mapping higher than READ ONLY.
12040 for (new_prot = UVM_PROT_READ_WRITE; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12041 bool pages_need_revocation;
12042 uvm_prot_t revoke_prot;
12043 bool this_processor_has_enabled_atomics;
12044
12045 if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12046 continue;
12047
12048 pages_need_revocation = uvm_page_mask_and(&service_context->revocation_mask,
12049 &service_context->did_not_migrate_mask,
12050 &service_context->mappings_by_prot[new_prot - 1].page_mask);
12051 if (!pages_need_revocation)
12052 continue;
12053
12054 uvm_processor_mask_and(revoke_processors, &va_block->mapped, &va_space->faultable_processors);
12055
12056 // Do not revoke the processor that took the fault
12057 uvm_processor_mask_clear(revoke_processors, processor_id);
12058
12059 this_processor_has_enabled_atomics = uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors,
12060 processor_id);
12061
12062 // Atomic operations on processors with system-wide atomics
12063 // disabled or with native atomics access to new_residency
12064 // behave like writes.
12065 if (new_prot == UVM_PROT_READ_WRITE ||
12066 !this_processor_has_enabled_atomics ||
12067 uvm_processor_mask_test(&va_space->has_native_atomics[uvm_id_value(new_residency)], processor_id)) {
12068
12069 // Exclude processors with native atomics on the resident copy
12070 uvm_processor_mask_andnot(revoke_processors,
12071 revoke_processors,
12072 &va_space->has_native_atomics[uvm_id_value(new_residency)]);
12073
12074 // Exclude processors with disabled system-wide atomics
12075 uvm_processor_mask_and(revoke_processors,
12076 revoke_processors,
12077 &va_space->system_wide_atomics_enabled_processors);
12078 }
12079
12080 if (UVM_ID_IS_CPU(processor_id)) {
12081 revoke_prot = UVM_PROT_READ_WRITE_ATOMIC;
12082 }
12083 else {
12084 revoke_prot = (new_prot == UVM_PROT_READ_WRITE_ATOMIC)? UVM_PROT_READ_WRITE:
12085 UVM_PROT_READ_WRITE_ATOMIC;
12086 }
12087
12088 // UVM-Lite processors must always have RWA mappings
12089 if (uvm_processor_mask_andnot(revoke_processors, revoke_processors, block_get_uvm_lite_gpus(va_block))) {
12090             // Access counters should never trigger revocations apart from those
12091             // needed for read-duplication, which are performed in the calls to
12092             // uvm_va_block_make_resident_read_duplicate, above.
12093 if (service_context->operation == UVM_SERVICE_OPERATION_ACCESS_COUNTERS) {
12094 UVM_ASSERT(check_access_counters_dont_revoke(va_block,
12095 service_context->block_context,
12096 service_context->region,
12097 revoke_processors,
12098 &service_context->revocation_mask,
12099 revoke_prot));
12100 }
12101
12102 // Downgrade other processors' mappings
12103 status = uvm_va_block_revoke_prot_mask(va_block,
12104 service_context->block_context,
12105 revoke_processors,
12106 service_context->region,
12107 &service_context->revocation_mask,
12108 revoke_prot);
12109 if (status != NV_OK)
12110 break;
12111 }
12112 }
12113
12114 uvm_processor_mask_cache_free(revoke_processors);
12115
12116 if (status != NV_OK)
12117 return status;
12118
12119 // 3- Map requesting processor with the necessary privileges
12120 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12121 const uvm_page_mask_t *map_prot_mask = &service_context->mappings_by_prot[new_prot - 1].page_mask;
12122
12123 if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12124 continue;
12125
12126 // 3.1 - Unmap CPU pages
12127 // HMM cpu mappings can be upgraded at any time without notification
12128 // so no need to downgrade first.
12129 if (service_context->operation != UVM_SERVICE_OPERATION_ACCESS_COUNTERS &&
12130 UVM_ID_IS_CPU(processor_id) &&
12131 !uvm_va_block_is_hmm(va_block)) {
12132 // The kernel can downgrade managed CPU mappings at any time without
12133 // notifying us, which means our PTE state could be stale. We
12134 // handle this by unmapping the CPU PTE and re-mapping it again.
12135 //
12136 // A CPU fault is unexpected if:
12137 // curr_prot == RW || (!is_write && curr_prot == RO)
12138 status = uvm_va_block_unmap(va_block,
12139 service_context->block_context,
12140 UVM_ID_CPU,
12141 service_context->region,
12142 map_prot_mask,
12143 NULL);
12144 if (status != NV_OK)
12145 return status;
12146 }
12147
12148 // 3.2 - Add new mappings
12149
12150 // The faulting processor can be mapped remotely due to user policy or
12151 // the thrashing mitigation heuristics. Therefore, we set the cause
12152 // accordingly in each case.
12153
12154 // Map pages that are thrashing first
12155 if (service_context->thrashing_pin_count > 0 && va_space->tools.enabled) {
12156 uvm_page_mask_t *helper_page_mask = &service_context->block_context->caller_page_mask;
12157 bool pages_need_mapping = uvm_page_mask_and(helper_page_mask,
12158 map_prot_mask,
12159 &service_context->thrashing_pin_mask);
12160 if (pages_need_mapping) {
12161 status = uvm_va_block_map(va_block,
12162 service_context->block_context,
12163 processor_id,
12164 service_context->region,
12165 helper_page_mask,
12166 new_prot,
12167 UvmEventMapRemoteCauseThrashing,
12168 &va_block->tracker);
12169 if (status != NV_OK)
12170 return status;
12171
12172 // Remove thrashing pages from the map mask
12173 pages_need_mapping = uvm_page_mask_andnot(helper_page_mask,
12174 map_prot_mask,
12175 &service_context->thrashing_pin_mask);
12176 if (!pages_need_mapping)
12177 continue;
12178
12179 map_prot_mask = helper_page_mask;
12180 }
12181 }
12182
12183 status = uvm_va_block_map(va_block,
12184 service_context->block_context,
12185 processor_id,
12186 service_context->region,
12187 map_prot_mask,
12188 new_prot,
12189 UvmEventMapRemoteCausePolicy,
12190 &va_block->tracker);
12191 if (status != NV_OK)
12192 return status;
12193 }
12194
12195 // 4- If pages did migrate, map SetAccessedBy processors, except for
12196 // UVM-Lite
12197 for (new_prot = UVM_PROT_READ_ONLY; new_prot <= UVM_PROT_READ_WRITE_ATOMIC; ++new_prot) {
12198 bool pages_need_mapping;
12199
12200 if (service_context->mappings_by_prot[new_prot - 1].count == 0)
12201 continue;
12202
12203 pages_need_mapping = uvm_page_mask_and(caller_page_mask,
12204 new_residency_mask,
12205 &service_context->mappings_by_prot[new_prot - 1].page_mask);
12206 if (!pages_need_mapping)
12207 continue;
12208
12209 // Map pages that are thrashing
12210 if (service_context->thrashing_pin_count > 0) {
12211 uvm_page_index_t page_index;
12212
12213 for_each_va_block_page_in_region_mask(page_index,
12214 &service_context->thrashing_pin_mask,
12215 service_context->region) {
12216 uvm_processor_mask_t *map_thrashing_processors = NULL;
12217 NvU64 page_addr = uvm_va_block_cpu_page_address(va_block, page_index);
12218
12219 // Check protection type
12220 if (!uvm_page_mask_test(caller_page_mask, page_index))
12221 continue;
12222
12223 map_thrashing_processors = uvm_perf_thrashing_get_thrashing_processors(va_block, page_addr);
12224
12225 status = uvm_va_block_add_mappings_after_migration(va_block,
12226 service_context->block_context,
12227 new_residency,
12228 processor_id,
12229 uvm_va_block_region_for_page(page_index),
12230 caller_page_mask,
12231 new_prot,
12232 map_thrashing_processors);
12233 if (status != NV_OK)
12234 return status;
12235 }
12236
12237 pages_need_mapping = uvm_page_mask_andnot(caller_page_mask,
12238 caller_page_mask,
12239 &service_context->thrashing_pin_mask);
12240 if (!pages_need_mapping)
12241 continue;
12242 }
12243
12244 // Map the rest of pages in a single shot
12245 status = uvm_va_block_add_mappings_after_migration(va_block,
12246 service_context->block_context,
12247 new_residency,
12248 processor_id,
12249 service_context->region,
12250 caller_page_mask,
12251 new_prot,
12252 NULL);
12253 if (status != NV_OK)
12254 return status;
12255 }
12256
12257 return NV_OK;
12258 }
12259
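// Top-level servicing entry point for a block. For each destination processor
// in service_context->resident_processors this calls
// uvm_va_block_service_copy() followed by uvm_va_block_service_finish(), or
// the HMM-specific path for HMM blocks. The block lock must be held; callers
// typically enter through UVM_VA_BLOCK_LOCK_RETRY() so allocation retries are
// handled, as uvm_va_block_cpu_fault() below does.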
12260 NV_STATUS uvm_va_block_service_locked(uvm_processor_id_t processor_id,
12261 uvm_va_block_t *va_block,
12262 uvm_va_block_retry_t *block_retry,
12263 uvm_service_block_context_t *service_context)
12264 {
12265 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12266 uvm_processor_id_t new_residency;
12267 NV_STATUS status = NV_OK;
12268
12269 uvm_assert_mutex_locked(&va_block->lock);
12270 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
12271 service_context->block_context->hmm.vma,
12272 service_context->region));
12273
12274 // GPU fault servicing must be done under the VA space read lock. GPU fault
12275 // servicing is required for RM to make forward progress, and we allow other
12276 // threads to call into RM while holding the VA space lock in read mode. If
12277 // we took the VA space lock in write mode on the GPU fault service path,
12278 // we could deadlock because the thread in RM which holds the VA space lock
12279 // for read wouldn't be able to complete until fault servicing completes.
12280 if (service_context->operation != UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS || UVM_ID_IS_CPU(processor_id))
12281 uvm_assert_rwsem_locked(&va_space->lock);
12282 else
12283 uvm_assert_rwsem_locked_read(&va_space->lock);
12284
12285 uvm_va_block_get_prefetch_hint(va_block,
12286 uvm_va_policy_get_region(va_block, service_context->region),
12287 service_context);
12288
12289 for_each_id_in_mask(new_residency, &service_context->resident_processors) {
12290 if (uvm_va_block_is_hmm(va_block)) {
12291 status = uvm_hmm_va_block_service_locked(processor_id,
12292 new_residency,
12293 va_block,
12294 block_retry,
12295 service_context);
12296 if (status != NV_OK)
12297 break;
12298
12299 continue;
12300 }
12301
12302 status = uvm_va_block_service_copy(processor_id, new_residency, va_block, block_retry, service_context);
12303 if (status != NV_OK)
12304 break;
12305
12306 status = uvm_va_block_service_finish(processor_id, va_block, service_context);
12307 if (status != NV_OK)
12308 break;
12309 }
12310
12311 return status;
12312 }
12313
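// Check whether processor_id is logically allowed the requested access type
// on the given page. This rejects accesses to zombie managed ranges, GPU
// accesses that exceed the vma protections (when those can be checked
// safely), and, for non-migratable ranges, CPU accesses as well as GPU
// accesses whose preferred location the GPU cannot map.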
12314 NV_STATUS uvm_va_block_check_logical_permissions(uvm_va_block_t *va_block,
12315 uvm_va_block_context_t *va_block_context,
12316 uvm_processor_id_t processor_id,
12317 uvm_page_index_t page_index,
12318 uvm_fault_access_type_t access_type,
12319 bool allow_migration)
12320 {
12321 uvm_va_range_t *va_range = va_block->va_range;
12322 uvm_prot_t access_prot = uvm_fault_access_type_to_prot(access_type);
12323
12324 UVM_ASSERT(uvm_hmm_check_context_vma_is_valid(va_block,
12325 va_block_context->hmm.vma,
12326 uvm_va_block_region_for_page(page_index)));
12327
12328 // CPU permissions are checked later by block_map_cpu_page.
12329 //
12330 // TODO: Bug 1766124: permissions are checked by block_map_cpu_page because
12331 // it can also be called from change_pte. Make change_pte call this
12332 // function and only check CPU permissions here.
12333 if (UVM_ID_IS_GPU(processor_id)) {
12334 if (va_range && uvm_va_range_is_managed_zombie(va_range))
12335 return NV_ERR_INVALID_ADDRESS;
12336
12337 // GPU faults only check vma permissions if a mm is registered with the
12338 // VA space (ie. uvm_va_space_mm_retain_lock(va_space) != NULL) or if
12339 // uvm_enable_builtin_tests is set, because the Linux kernel can change
12340 // vm_flags at any moment (for example on mprotect) and here we are not
12341 // guaranteed to have vma->vm_mm->mmap_lock. During tests we ensure that
12342 // this scenario does not happen.
12343 if (((va_block->hmm.va_space && va_block->hmm.va_space->va_space_mm.mm) || uvm_enable_builtin_tests) &&
12344 (access_prot > compute_logical_prot(va_block, va_block_context->hmm.vma, page_index)))
12345 return NV_ERR_INVALID_ACCESS_TYPE;
12346 }
12347
12348 // Non-migratable range:
12349 // - CPU accesses are always fatal, regardless of the VA range residency
12350 // - GPU accesses are fatal if the GPU can't map the preferred location
12351 if (!allow_migration) {
12352 UVM_ASSERT(!uvm_va_block_is_hmm(va_block));
12353
12354 if (UVM_ID_IS_CPU(processor_id)) {
12355 return NV_ERR_INVALID_OPERATION;
12356 }
12357 else {
12358 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12359
12360 return uvm_processor_mask_test(
12361 &va_space->accessible_from[uvm_id_value(uvm_va_range_get_policy(va_range)->preferred_location)],
12362 processor_id)?
12363 NV_OK : NV_ERR_INVALID_ACCESS_TYPE;
12364 }
12365 }
12366
12367 return NV_OK;
12368 }
12369
12370 // Check if we are faulting on a page with valid permissions to check if we can
12371 // skip fault handling. See uvm_va_block_t::cpu::fault_authorized for more
12372 // details
12373 static bool skip_cpu_fault_with_valid_permissions(uvm_va_block_t *va_block,
12374 uvm_page_index_t page_index,
12375 uvm_fault_access_type_t fault_access_type)
12376 {
12377 // TODO: Bug 3900038: is skip_cpu_fault_with_valid_permissions() needed for
12378 // HMM?
12379 if (uvm_va_block_is_hmm(va_block))
12380 return false;
12381
12382 if (block_page_is_processor_authorized(va_block,
12383 page_index,
12384 UVM_ID_CPU,
12385 uvm_fault_access_type_to_prot(fault_access_type))) {
12386 NvU64 now = NV_GETTIME();
12387 pid_t pid = current->pid;
12388
12389 // Latch the pid/timestamp/page_index values for the first time
12390 if (!va_block->cpu.fault_authorized.first_fault_stamp) {
12391 va_block->cpu.fault_authorized.first_fault_stamp = now;
12392 va_block->cpu.fault_authorized.first_pid = pid;
12393 va_block->cpu.fault_authorized.page_index = page_index;
12394
12395 return true;
12396 }
12397
12398 // If the same thread shows up again, this means that the kernel
12399 // downgraded the page's PTEs. Service the fault to force a remap of
12400 // the page.
12401 if (va_block->cpu.fault_authorized.first_pid == pid &&
12402 va_block->cpu.fault_authorized.page_index == page_index) {
12403 va_block->cpu.fault_authorized.first_fault_stamp = 0;
12404 }
12405 else {
12406 // If the window has expired, clear the information and service the
12407 // fault. Otherwise, just return
12408 if (now - va_block->cpu.fault_authorized.first_fault_stamp > uvm_perf_authorized_cpu_fault_tracking_window_ns)
12409 va_block->cpu.fault_authorized.first_fault_stamp = 0;
12410 else
12411 return true;
12412 }
12413 }
12414
12415 return false;
12416 }
12417
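// Service a CPU fault on a single page of this block with the block lock
// held: check logical permissions, apply thrashing hints (throttling returns
// NV_WARN_MORE_PROCESSING_REQUIRED along with a wakeup time stamp), pick the
// new residency, and call uvm_va_block_service_locked().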
12418 static NV_STATUS block_cpu_fault_locked(uvm_va_block_t *va_block,
12419 uvm_va_block_retry_t *va_block_retry,
12420 NvU64 fault_addr,
12421 uvm_fault_access_type_t fault_access_type,
12422 uvm_service_block_context_t *service_context)
12423 {
12424 uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
12425 NV_STATUS status = NV_OK;
12426 uvm_page_index_t page_index;
12427 uvm_perf_thrashing_hint_t thrashing_hint;
12428 uvm_processor_id_t new_residency;
12429 bool read_duplicate;
12430 const uvm_va_policy_t *policy;
12431 const bool hmm_migratable = true;
12432
12433 uvm_assert_rwsem_locked(&va_space->lock);
12434
12435 UVM_ASSERT(fault_addr >= va_block->start);
12436 UVM_ASSERT(fault_addr <= va_block->end);
12437
12438 uvm_assert_mmap_lock_locked(service_context->block_context->mm);
12439
12440 policy = uvm_va_policy_get(va_block, fault_addr);
12441
12442 if (service_context->num_retries == 0) {
12443 // notify event to tools/performance heuristics
12444 uvm_perf_event_notify_cpu_fault(&va_space->perf_events,
12445 va_block,
12446 policy->preferred_location,
12447 fault_addr,
12448 fault_access_type > UVM_FAULT_ACCESS_TYPE_READ,
12449 KSTK_EIP(current));
12450 }
12451
12452 // Check logical permissions
12453 page_index = uvm_va_block_cpu_page_index(va_block, fault_addr);
12454 status = uvm_va_block_check_logical_permissions(va_block,
12455 service_context->block_context,
12456 UVM_ID_CPU,
12457 page_index,
12458 fault_access_type,
12459 uvm_range_group_address_migratable(va_space, fault_addr));
12460 if (status != NV_OK)
12461 return status;
12462
12463 uvm_processor_mask_zero(&service_context->cpu_fault.gpus_to_check_for_ecc);
12464
12465 if (skip_cpu_fault_with_valid_permissions(va_block, page_index, fault_access_type))
12466 return NV_OK;
12467
12468 thrashing_hint = uvm_perf_thrashing_get_hint(va_block, service_context->block_context, fault_addr, UVM_ID_CPU);
12469 // Throttling is implemented by sleeping in the fault handler on the CPU
12470 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_THROTTLE) {
12471 service_context->cpu_fault.wakeup_time_stamp = thrashing_hint.throttle.end_time_stamp;
12472 return NV_WARN_MORE_PROCESSING_REQUIRED;
12473 }
12474
12475 service_context->read_duplicate_count = 0;
12476 service_context->thrashing_pin_count = 0;
12477 service_context->operation = UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS;
12478
12479 if (thrashing_hint.type == UVM_PERF_THRASHING_HINT_TYPE_PIN) {
12480 uvm_page_mask_zero(&service_context->thrashing_pin_mask);
12481 uvm_page_mask_set(&service_context->thrashing_pin_mask, page_index);
12482 service_context->thrashing_pin_count = 1;
12483 }
12484
12485 // Compute new residency and update the masks
12486 new_residency = uvm_va_block_select_residency(va_block,
12487 service_context->block_context,
12488 page_index,
12489 UVM_ID_CPU,
12490 uvm_fault_access_type_mask_bit(fault_access_type),
12491 policy,
12492 &thrashing_hint,
12493 UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS,
12494 hmm_migratable,
12495 &read_duplicate);
12496
12497 // Initialize the minimum necessary state in the fault service context
12498 uvm_processor_mask_zero(&service_context->resident_processors);
12499
12500 // Set new residency and update the masks
12501 uvm_processor_mask_set(&service_context->resident_processors, new_residency);
12502
12503 // The masks need to be fully zeroed as the fault region may grow due to prefetching
12504 uvm_page_mask_zero(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
12505 uvm_page_mask_set(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency, page_index);
12506
12507 if (read_duplicate) {
12508 uvm_page_mask_zero(&service_context->read_duplicate_mask);
12509 uvm_page_mask_set(&service_context->read_duplicate_mask, page_index);
12510 service_context->read_duplicate_count = 1;
12511 }
12512
12513 service_context->access_type[page_index] = fault_access_type;
12514
12515 service_context->region = uvm_va_block_region_for_page(page_index);
12516
12517 status = uvm_va_block_service_locked(UVM_ID_CPU, va_block, va_block_retry, service_context);
12518 UVM_ASSERT(status != NV_WARN_MISMATCHED_TARGET);
12519
12520 ++service_context->num_retries;
12521
12522 return status;
12523 }
12524
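// Entry point for CPU fault servicing on this block. Write faults are
// serviced as UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG, presumably so the
// resulting mapping also satisfies subsequent atomic accesses from the CPU.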
12525 NV_STATUS uvm_va_block_cpu_fault(uvm_va_block_t *va_block,
12526 NvU64 fault_addr,
12527 bool is_write,
12528 uvm_service_block_context_t *service_context)
12529 {
12530 NV_STATUS status;
12531 uvm_va_block_retry_t va_block_retry;
12532 uvm_fault_access_type_t fault_access_type;
12533
12534 if (is_write)
12535 fault_access_type = UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG;
12536 else
12537 fault_access_type = UVM_FAULT_ACCESS_TYPE_READ;
12538
12539 service_context->num_retries = 0;
12540 service_context->cpu_fault.did_migrate = false;
12541
12542 // We have to use vm_insert_page instead of handing the page to the kernel
12543 // and letting it insert the mapping, and we must do that while holding the
12544 // lock on this VA block. Otherwise there will be a window in which we think
12545 // we've mapped the page but the CPU mapping hasn't actually been created
12546 // yet. During that window a GPU fault event could arrive and claim
12547 // ownership of that VA, "unmapping" it. Then later the kernel would
12548 // eventually establish the mapping, and we'd end up with both CPU and GPU
12549 // thinking they each owned the page.
12550 //
12551 // This function must only be called when it's safe to call vm_insert_page.
12552 // That is, there must be a reference held on the vma's vm_mm, and
12553 // vm_mm->mmap_lock is held in at least read mode. Note that current->mm
12554 // might not be vma->vm_mm.
12555 status = UVM_VA_BLOCK_LOCK_RETRY(va_block,
12556 &va_block_retry,
12557 block_cpu_fault_locked(va_block,
12558 &va_block_retry,
12559 fault_addr,
12560 fault_access_type,
12561 service_context));
12562 return status;
12563 }
12564
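// Look up the existing block containing addr. Managed VA ranges are searched
// first; addresses outside any VA range fall back to the HMM block lookup.
// No block is created if one doesn't exist yet.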
12565 NV_STATUS uvm_va_block_find(uvm_va_space_t *va_space, NvU64 addr, uvm_va_block_t **out_block)
12566 {
12567 uvm_va_range_t *va_range;
12568 uvm_va_block_t *block;
12569 size_t index;
12570
12571 va_range = uvm_va_range_find(va_space, addr);
12572 if (!va_range)
12573 return uvm_hmm_va_block_find(va_space, addr, out_block);
12574
12575 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
12576 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
12577
12578 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
12579 return NV_ERR_INVALID_ADDRESS;
12580
12581 index = uvm_va_range_block_index(va_range, addr);
12582 block = uvm_va_range_block(va_range, index);
12583 if (!block)
12584 return NV_ERR_OBJECT_NOT_FOUND;
12585
12586 *out_block = block;
12587 return NV_OK;
12588 }
12589
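// Find-or-create variant for managed ranges: create the block covering addr
// within va_range if it doesn't exist yet. The _managed and plain find_create
// wrappers below select between this path and the HMM path.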
12590 NV_STATUS uvm_va_block_find_create_in_range(uvm_va_space_t *va_space,
12591 uvm_va_range_t *va_range,
12592 NvU64 addr,
12593 uvm_va_block_t **out_block)
12594 {
12595 size_t index;
12596
12597 if (uvm_enable_builtin_tests && atomic_dec_if_positive(&va_space->test.va_block_allocation_fail_nth) == 0)
12598 return NV_ERR_NO_MEMORY;
12599
12600 UVM_ASSERT(va_range);
12601 UVM_ASSERT(addr >= va_range->node.start);
12602 UVM_ASSERT(addr <= va_range->node.end);
12603
12604 UVM_ASSERT(uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_INVALID_ADDRESS ||
12605 uvm_hmm_va_block_find(va_space, addr, out_block) == NV_ERR_OBJECT_NOT_FOUND);
12606
12607 if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
12608 return NV_ERR_INVALID_ADDRESS;
12609
12610 index = uvm_va_range_block_index(va_range, addr);
12611 return uvm_va_range_block_create(va_range, index, out_block);
12612 }
12613
12614 NV_STATUS uvm_va_block_find_create_managed(uvm_va_space_t *va_space,
12615 NvU64 addr,
12616 uvm_va_block_t **out_block)
12617 {
12618 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
12619
12620 if (va_range)
12621 return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
12622 else
12623 return NV_ERR_INVALID_ADDRESS;
12624 }
12625
12626 NV_STATUS uvm_va_block_find_create(uvm_va_space_t *va_space,
12627 NvU64 addr,
12628 struct vm_area_struct **hmm_vma,
12629 uvm_va_block_t **out_block)
12630 {
12631 uvm_va_range_t *va_range = uvm_va_range_find(va_space, addr);
12632
12633 if (hmm_vma)
12634 *hmm_vma = NULL;
12635
12636 if (va_range)
12637 return uvm_va_block_find_create_in_range(va_space, va_range, addr, out_block);
12638 else
12639 return uvm_hmm_va_block_find_create(va_space, addr, hmm_vma, out_block);
12640 }
12641
12642 static NV_STATUS va_block_write_cpu_to_gpu(uvm_va_block_t *va_block,
12643 uvm_gpu_t *gpu,
12644 uvm_gpu_address_t dst_gpu_address,
12645 NvU64 dst,
12646 uvm_mem_t *src_mem,
12647 size_t size)
12648 {
12649 NV_STATUS status;
12650 uvm_push_t push;
12651 uvm_gpu_address_t src_gpu_address;
12652
12653 if (g_uvm_global.conf_computing_enabled) {
12654 return uvm_conf_computing_util_memcopy_cpu_to_gpu(gpu,
12655 dst_gpu_address,
12656 uvm_mem_get_cpu_addr_kernel(src_mem),
12657 size,
12658 &va_block->tracker,
12659 "Encrypted write to [0x%llx, 0x%llx)",
12660 dst,
12661 dst + size);
12662 }
12663
12664 status = uvm_push_begin_acquire(gpu->channel_manager,
12665 UVM_CHANNEL_TYPE_CPU_TO_GPU,
12666 &va_block->tracker,
12667 &push,
12668 "Direct write to [0x%llx, 0x%llx)",
12669 dst,
12670 dst + size);
12671 if (status != NV_OK)
12672 return status;
12673
12674 src_gpu_address = uvm_mem_gpu_address_virtual_kernel(src_mem, gpu);
12675 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
12676 return uvm_push_end_and_wait(&push);
12677 }
12678
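// Write size bytes from src_mem to the block at virtual address dst. The
// write must not cross a page boundary. The target page is first made
// resident (breaking read-duplication); the data is then written either with
// a CPU memcpy, or with a CE or encrypted copy if the page is resident on a
// GPU.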
12679 NV_STATUS uvm_va_block_write_from_cpu(uvm_va_block_t *va_block,
12680 uvm_va_block_context_t *block_context,
12681 NvU64 dst,
12682 uvm_mem_t *src_mem,
12683 size_t size)
12684 {
12685 NV_STATUS status;
12686 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, dst);
12687 NvU64 page_offset = dst & (PAGE_SIZE - 1);
12688 uvm_processor_id_t proc = uvm_va_block_page_get_closest_resident(va_block, block_context, page_index, UVM_ID_CPU);
12689 uvm_va_block_region_t region = uvm_va_block_region_for_page(page_index);
12690
12691 uvm_assert_mutex_locked(&va_block->lock);
12692 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Write spans multiple pages: dst 0x%llx, size 0x%zx\n", dst, size);
12693
12694 if (UVM_ID_IS_INVALID(proc))
12695 proc = UVM_ID_CPU;
12696
12697     // Use make_resident() in all cases to break read-duplication. block_retry
12698     // can be NULL because, if the page is not resident yet, we will make it
12699     // resident on the CPU.
12700 // Notably we don't care about coherence with respect to atomics from other
12701 // processors.
12702 status = uvm_va_block_make_resident(va_block,
12703 NULL,
12704 block_context,
12705 proc,
12706 region,
12707 NULL,
12708 NULL,
12709 UVM_MAKE_RESIDENT_CAUSE_API_TOOLS);
12710
12711 if (status != NV_OK)
12712 return status;
12713
12714 if (UVM_ID_IS_CPU(proc)) {
12715 char *mapped_page;
12716 struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
12717 void *src = uvm_mem_get_cpu_addr_kernel(src_mem);
12718
12719 status = uvm_tracker_wait(&va_block->tracker);
12720 if (status != NV_OK)
12721 return status;
12722
12723 mapped_page = (char *)kmap(page);
12724 memcpy(mapped_page + page_offset, src, size);
12725 kunmap(page);
12726
12727 return NV_OK;
12728 }
12729 else {
12730 uvm_gpu_t *dst_gpu;
12731 uvm_gpu_address_t dst_gpu_address;
12732
12733 UVM_ASSERT(UVM_ID_IS_GPU(proc));
12734
12735 dst_gpu = block_get_gpu(va_block, proc);
12736
12737 dst_gpu_address = block_phys_page_copy_address(va_block,
12738 block_phys_page(proc, NUMA_NO_NODE, page_index),
12739 dst_gpu);
12740 dst_gpu_address.address += page_offset;
12741
12742 return va_block_write_cpu_to_gpu(va_block, dst_gpu, dst_gpu_address, dst, src_mem, size);
12743 }
12744 }
12745
12746 static NV_STATUS va_block_read_gpu_to_cpu(uvm_va_block_t *va_block,
12747 uvm_mem_t *dst_mem,
12748 uvm_gpu_t *gpu,
12749 uvm_gpu_address_t src_gpu_address,
12750 NvU64 src,
12751 size_t size)
12752 {
12753 NV_STATUS status;
12754 uvm_push_t push;
12755 uvm_gpu_address_t dst_gpu_address;
12756
12757 if (g_uvm_global.conf_computing_enabled) {
12758 return uvm_conf_computing_util_memcopy_gpu_to_cpu(gpu,
12759 uvm_mem_get_cpu_addr_kernel(dst_mem),
12760 src_gpu_address,
12761 size,
12762 &va_block->tracker,
12763 "Encrypted read from [0x%llx, 0x%llx)",
12764 src,
12765 src + size);
12766 }
12767
12768 status = uvm_push_begin_acquire(gpu->channel_manager,
12769 UVM_CHANNEL_TYPE_GPU_TO_CPU,
12770 &va_block->tracker,
12771 &push,
12772 "Direct read from [0x%llx, 0x%llx)",
12773 src,
12774 src + size);
12775 if (status != NV_OK)
12776 return status;
12777
12778 dst_gpu_address = uvm_mem_gpu_address_virtual_kernel(dst_mem, gpu);
12779 gpu->parent->ce_hal->memcopy(&push, dst_gpu_address, src_gpu_address, size);
12780 return uvm_push_end_and_wait(&push);
12781 }
12782
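// Read size bytes from the block at virtual address src into dst_mem. The
// read must not cross a page boundary. Pages with no resident copy read as
// zeros; otherwise the data is copied from the closest resident location,
// using a CE or encrypted copy when that location is a GPU.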
12783 NV_STATUS uvm_va_block_read_to_cpu(uvm_va_block_t *va_block,
12784 uvm_va_block_context_t *va_block_context,
12785 uvm_mem_t *dst_mem,
12786 NvU64 src,
12787 size_t size)
12788 {
12789 uvm_page_index_t page_index = uvm_va_block_cpu_page_index(va_block, src);
12790 NvU64 page_offset = src & (PAGE_SIZE - 1);
12791 uvm_processor_id_t proc;
12792 void *dst = uvm_mem_get_cpu_addr_kernel(dst_mem);
12793
12794 uvm_assert_mutex_locked(&va_block->lock);
12795 UVM_ASSERT_MSG(page_offset + size <= PAGE_SIZE, "Read spans multiple pages: src 0x%llx, size 0x%zx\n", src, size);
12796
12797 proc = uvm_va_block_page_get_closest_resident(va_block, va_block_context, page_index, UVM_ID_CPU);
12798 if (UVM_ID_IS_INVALID(proc)) {
12799 memset(dst, 0, size);
12800 return NV_OK;
12801 }
12802 else if (UVM_ID_IS_CPU(proc)) {
12803 NV_STATUS status;
12804 char *mapped_page;
12805 struct page *page = uvm_va_block_get_cpu_page(va_block, page_index);
12806
12807 status = uvm_tracker_wait(&va_block->tracker);
12808 if (status != NV_OK)
12809 return status;
12810
12811 mapped_page = (char *)kmap(page);
12812 memcpy(dst, mapped_page + page_offset, size);
12813 kunmap(page);
12814
12815 return NV_OK;
12816 }
12817 else {
12818 uvm_gpu_address_t src_gpu_address;
12819 uvm_gpu_t *gpu = block_get_gpu(va_block, proc);
12820
12821 src_gpu_address = block_phys_page_copy_address(va_block,
12822 block_phys_page(proc, NUMA_NO_NODE, page_index),
12823 gpu);
12824 src_gpu_address.address += page_offset;
12825
12826 return va_block_read_gpu_to_cpu(va_block, dst_mem, gpu, src_gpu_address, src, size);
12827 }
12828 }
12829
12830 // Deferred work item reestablishing accessed-by mappings after eviction. On
12831 // GPUs with access counters enabled, the evicted GPU will also get remote
12832 // mappings.
12833 static void block_add_eviction_mappings(void *args)
12834 {
12835 uvm_va_block_t *va_block = (uvm_va_block_t*)args;
12836 uvm_va_space_t *va_space;
12837 uvm_processor_id_t id;
12838 uvm_va_block_context_t *block_context = NULL;
12839 struct mm_struct *mm = NULL;
12840
12841 uvm_mutex_lock(&va_block->lock);
12842 va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
12843 uvm_mutex_unlock(&va_block->lock);
12844
12845 if (!va_space) {
12846 // Block has been killed in the meantime
12847 goto done;
12848 }
12849
12850 mm = uvm_va_space_mm_retain_lock(va_space);
12851
12852 block_context = uvm_va_block_context_alloc(mm);
12853 if (!block_context)
12854 goto done;
12855
12856 // The block wasn't dead when we checked above and that's enough to
12857 // guarantee that the VA space is still around, because
12858 // uvm_va_space_destroy() flushes the associated nv_kthread_q, and that
12859 // flush waits for this function call to finish.
12860 uvm_va_space_down_read(va_space);
12861
12862 // Now that we have the VA space lock held, we can check whether the block
12863 // is still alive since the VA space write lock is needed to kill blocks.
12864 if (uvm_va_block_is_dead(va_block))
12865 goto unlock;
12866
12867 if (uvm_va_block_is_hmm(va_block)) {
12868 uvm_hmm_block_add_eviction_mappings(va_space, va_block, block_context);
12869 }
12870 else {
12871 uvm_va_range_t *va_range = va_block->va_range;
12872 NV_STATUS status = NV_OK;
12873
12874 for_each_id_in_mask(id, &uvm_va_range_get_policy(va_range)->accessed_by) {
12875 status = uvm_va_block_set_accessed_by(va_block, block_context, id);
12876 if (status != NV_OK)
12877 break;
12878 }
12879
12880 if (status == NV_OK && uvm_va_space_map_remote_on_eviction(va_space)) {
12881 uvm_processor_mask_t *map_processors = &block_context->map_processors_eviction;
12882
12883 // Exclude the processors that have been already mapped due to
12884 // AccessedBy
12885 uvm_processor_mask_andnot(map_processors,
12886 &va_block->evicted_gpus,
12887 &uvm_va_range_get_policy(va_range)->accessed_by);
12888
12889 for_each_gpu_id_in_mask(id, map_processors) {
12890 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
12891 uvm_va_block_gpu_state_t *gpu_state;
12892
12893 if (!gpu->parent->access_counters_supported)
12894 continue;
12895
12896 gpu_state = uvm_va_block_gpu_state_get(va_block, id);
12897 UVM_ASSERT(gpu_state);
12898
12899 // TODO: Bug 2096389: uvm_va_block_add_mappings does not add
12900 // remote mappings to read-duplicated pages. Add support for it
12901 // or create a new function.
12902 status = UVM_VA_BLOCK_LOCK_RETRY(va_block, NULL,
12903 uvm_va_block_add_mappings(va_block,
12904 block_context,
12905 id,
12906 uvm_va_block_region_from_block(va_block),
12907 &gpu_state->evicted,
12908 UvmEventMapRemoteCauseEviction));
12909 if (status != NV_OK)
12910 break;
12911 }
12912 }
12913
12914 if (status != NV_OK) {
12915 UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s, processor %s\n",
12916 va_block->start,
12917 va_block->end,
12918 nvstatusToString(status),
12919 uvm_va_space_processor_name(va_space, id));
12920 }
12921 }
12922
12923 unlock:
12924 uvm_va_space_up_read(va_space);
12925 uvm_va_block_context_free(block_context);
12926
12927 done:
12928 uvm_va_space_mm_release_unlock(va_space, mm);
12929 uvm_va_block_release(va_block);
12930 }
12931
12932 static void block_add_eviction_mappings_entry(void *args)
12933 {
12934 UVM_ENTRY_VOID(block_add_eviction_mappings(args));
12935 }
12936
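// Evict from this block all GPU chunks that are subchunks of root_chunk:
// migrate the pages resident on those chunks to the CPU, then unmap the
// chunks and mark them evicted. Mappings cannot be re-established here
// because the VA space lock may not be held, so block_add_eviction_mappings()
// is scheduled as deferred work when needed.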
12937 NV_STATUS uvm_va_block_evict_chunks(uvm_va_block_t *va_block,
12938 uvm_gpu_t *gpu,
12939 uvm_gpu_chunk_t *root_chunk,
12940 uvm_tracker_t *tracker)
12941 {
12942 NV_STATUS status = NV_OK;
12943 NvU32 i;
12944 uvm_va_block_gpu_state_t *gpu_state;
12945 uvm_va_block_region_t chunk_region;
12946 size_t num_gpu_chunks = block_num_gpu_chunks(va_block, gpu);
12947 size_t chunks_to_evict = 0;
12948 uvm_service_block_context_t *service_context;
12949 uvm_va_block_context_t *block_context;
12950 uvm_page_mask_t *pages_to_evict;
12951 uvm_va_block_test_t *va_block_test = uvm_va_block_get_test(va_block);
12952 uvm_va_space_t *va_space = uvm_va_block_get_va_space_maybe_dead(va_block);
12953 struct mm_struct *mm;
12954 bool accessed_by_set = false;
12955
12956 uvm_assert_mutex_locked(&va_block->lock);
12957
12958 // The block might have been killed in the meantime
12959 if (!va_space)
12960 return NV_OK;
12961
12962 gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
12963 if (!gpu_state)
12964 return NV_OK;
12965
12966 if (va_block_test && va_block_test->inject_eviction_error) {
12967 va_block_test->inject_eviction_error = false;
12968 return NV_ERR_NO_MEMORY;
12969 }
12970
12971     // We cannot take this block's VA space or mmap_lock locks on the eviction
12972     // path. However, we retain the mm in order to support accounting of CPU
12973     // memory allocations. If mappings need to be created,
12974     // block_add_eviction_mappings() will be scheduled below.
12975 mm = uvm_va_space_mm_retain(va_space);
12976
12977 service_context = uvm_service_block_context_alloc(mm);
12978 if (!service_context) {
12979 if (mm)
12980 uvm_va_space_mm_release(va_space);
12981
12982 return NV_ERR_NO_MEMORY;
12983 }
12984
12985 block_context = service_context->block_context;
12986
12987 pages_to_evict = &block_context->caller_page_mask;
12988 uvm_page_mask_zero(pages_to_evict);
12989 chunk_region.outer = 0;
12990
12991 // Find all chunks that are subchunks of the root chunk
12992 for (i = 0; i < num_gpu_chunks; ++i) {
12993 uvm_chunk_size_t chunk_size;
12994 size_t chunk_index = block_gpu_chunk_index(va_block, gpu, chunk_region.outer, &chunk_size);
12995 UVM_ASSERT(chunk_index == i);
12996 chunk_region.first = chunk_region.outer;
12997 chunk_region.outer = chunk_region.first + chunk_size / PAGE_SIZE;
12998
12999 if (!gpu_state->chunks[i])
13000 continue;
13001 if (!uvm_gpu_chunk_same_root(gpu_state->chunks[i], root_chunk))
13002 continue;
13003
13004 if (uvm_va_block_is_hmm(va_block)) {
13005 status = uvm_hmm_va_block_evict_chunk_prep(va_block, block_context, gpu_state->chunks[i], chunk_region);
13006 if (status != NV_OK)
13007 break;
13008 }
13009
13010 uvm_page_mask_region_fill(pages_to_evict, chunk_region);
13011 ++chunks_to_evict;
13012 }
13013
13014 if (chunks_to_evict == 0)
13015 goto out;
13016
13017 // Only move pages resident on the GPU
13018 uvm_page_mask_and(pages_to_evict, pages_to_evict, uvm_va_block_resident_mask_get(va_block, gpu->id, NUMA_NO_NODE));
13019 uvm_processor_mask_zero(&block_context->make_resident.all_involved_processors);
13020
13021 if (uvm_va_block_is_hmm(va_block)) {
13022 status = uvm_hmm_va_block_evict_chunks(va_block,
13023 service_context,
13024 pages_to_evict,
13025 uvm_va_block_region_from_block(va_block),
13026 &accessed_by_set);
13027 }
13028 else {
13029 const uvm_va_policy_t *policy = uvm_va_range_get_policy(va_block->va_range);
13030 accessed_by_set = uvm_processor_mask_get_count(&policy->accessed_by) > 0;
13031
13032 // TODO: Bug 1765193: make_resident() breaks read-duplication, but it's
13033 // not necessary to do so for eviction. Add a version that unmaps only
13034 // the processors that have mappings to the pages being evicted.
13035 status = uvm_va_block_make_resident(va_block,
13036 NULL,
13037 block_context,
13038 UVM_ID_CPU,
13039 uvm_va_block_region_from_block(va_block),
13040 pages_to_evict,
13041 NULL,
13042 UVM_MAKE_RESIDENT_CAUSE_EVICTION);
13043 }
13044 if (status != NV_OK)
13045 goto out;
13046
13047 // VA space lock may not be held and hence we cannot reestablish any
13048 // mappings here and need to defer it to a work queue.
13049 //
13050 // Reading the accessed_by mask without the VA space lock is safe because
13051 // adding a new processor to the mask triggers going over all the VA blocks
13052 // in the range and locking them. And we hold one of the VA block's locks.
13053 //
13054 // If uvm_va_range_set_accessed_by() hasn't called
13055 // uvm_va_block_set_accessed_by() for this block yet then it will take care
13056 // of adding the mapping after we are done. If it already did then we are
13057 // guaranteed to see the new processor in the accessed_by mask because we
13058 // locked the block's lock that the thread calling
13059 // uvm_va_range_set_accessed_by() unlocked after updating the mask.
13060 //
13061 // If a processor gets removed from the mask then we might not notice and
13062 // schedule the work item anyway, but that's benign as
13063 // block_add_eviction_mappings() re-examines the mask.
13064 //
13065 // Checking if access counters migrations are enabled on a VA space is racy
13066 // without holding the VA space lock. However, this is fine as
13067 // block_add_eviction_mappings() reexamines the value with the VA space
13068 // lock being held.
13069 if (accessed_by_set || (gpu->parent->access_counters_supported && uvm_va_space_map_remote_on_eviction(va_space))) {
13070 // Always retain the VA block first so that it's safe for the deferred
13071 // callback to release it immediately after it runs.
13072 uvm_va_block_retain(va_block);
13073
13074 if (!nv_kthread_q_schedule_q_item(&g_uvm_global.global_q,
13075 &va_block->eviction_mappings_q_item)) {
13076 // And release it if no new callback was scheduled
13077 uvm_va_block_release_no_destroy(va_block);
13078 }
13079 }
13080
13081 status = uvm_tracker_add_tracker_safe(tracker, &va_block->tracker);
13082 if (status != NV_OK)
13083 goto out;
13084
13085 for (i = 0; i < num_gpu_chunks; ++i) {
13086 uvm_gpu_id_t accessing_gpu_id;
13087 uvm_gpu_chunk_t *chunk = gpu_state->chunks[i];
13088
13089 if (!chunk)
13090 continue;
13091 if (!uvm_gpu_chunk_same_root(chunk, root_chunk))
13092 continue;
13093
13094 // Remove the mappings of indirect peers from the reverse map. We
13095 // access the indirect peer mask from the VA space without holding the
13096 // VA space lock. Therefore, we can race with enable_peer/disable_peer
13097 // operations. However this is fine:
13098 //
13099 // The enable_peer sequence is as follows:
13100 //
13101 // set_bit in va_space->indirect_peers
13102 // uvm_va_block_enable_peer;
13103 //
13104 // - If we read the mask BEFORE it is set or AFTER the mapping has
13105 // been added to the map there is no race.
13106 // - If we read the mask AFTER it is set but BEFORE adding the mapping
13107 // to the reverse map, we will try to remove it although it is not
13108 // there yet. Therefore, we use
13109 // uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction, which does
13110 // not check if the mapping is present in the reverse map.
13111 //
13112 // The disable_peer sequence is as follows:
13113 //
13114 // uvm_va_block_disable_peer;
13115 // clear_bit in va_space->indirect_peers
13116 //
13117 // - If we read the mask BEFORE the mapping has been added to the map
13118 // or AFTER the bit has been cleared, there is no race.
13119 // - If we read the mask AFTER the mapping has been removed and BEFORE
13120 // the bit is cleared, we will try to remove the mapping, too.
13121 // Again, uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction works
13122 // in this scenario.
13123 // Obtain the uvm_gpu_t directly via the parent GPU's id since indirect
13124 // peers are not supported when SMC is enabled.
13125 for_each_gpu_id_in_mask(accessing_gpu_id, &va_space->indirect_peers[uvm_id_value(gpu->id)]) {
13126 uvm_gpu_t *accessing_gpu = uvm_va_space_get_gpu(va_space, accessing_gpu_id);
13127 NvU64 peer_addr = uvm_pmm_gpu_indirect_peer_addr(&gpu->pmm, chunk, accessing_gpu);
13128
13129 uvm_pmm_sysmem_mappings_remove_gpu_mapping_on_eviction(&accessing_gpu->pmm_reverse_sysmem_mappings,
13130 peer_addr);
13131 }
13132
13133 uvm_mmu_chunk_unmap(chunk, tracker);
13134
13135 uvm_pmm_gpu_mark_chunk_evicted(&gpu->pmm, gpu_state->chunks[i]);
13136 gpu_state->chunks[i] = NULL;
13137 }
13138
13139 out:
13140 uvm_service_block_context_free(service_context);
13141
13142 if (mm)
13143 uvm_va_space_mm_release(va_space);
13144
13145 return status;
13146 }
13147
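// Force all of this GPU's PTEs for the block down to 4K and prevent future
// upgrades. Used on fault cancel for GPUs without VA-based cancel support,
// where fatal and non-fatal faults must land on separate 4K PTEs.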
13148 static NV_STATUS block_gpu_force_4k_ptes(uvm_va_block_t *block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
13149 {
13150 uvm_va_block_gpu_state_t *gpu_state = block_gpu_state_get_alloc(block, gpu);
13151 uvm_push_t push;
13152 NV_STATUS status;
13153
13154 // See comment in uvm_va_block_set_cancel
13155 UVM_ASSERT(!gpu->parent->fault_cancel_va_supported);
13156
13157 if (!gpu_state)
13158 return NV_ERR_NO_MEMORY;
13159
13160 // Force all pages to be 4K and prevent future upgrades during cancel
13161 gpu_state->force_4k_ptes = true;
13162
13163 // If we have no page tables we're done. For fault cancel we need to make
13164 // sure that fatal faults are on different 4k PTEs than non-fatal faults,
13165 // and we need to service all non-fatal faults before issuing the cancel. So
13166 // either all faults are fatal and we have no PTEs (we're PROT_NONE), or
13167 // we'll allocate PTEs later when we service the non-fatal faults. Those
13168 // PTEs will be 4k since force_4k_ptes is set.
13169 if (!block_gpu_has_page_tables(block, gpu))
13170 return NV_OK;
13171
13172 // Are we 4k already?
13173 if (!gpu_state->pte_is_2m && bitmap_empty(gpu_state->big_ptes, MAX_BIG_PAGES_PER_UVM_VA_BLOCK))
13174 return NV_OK;
13175
13176 status = block_alloc_ptes_with_retry(block, gpu, UVM_PAGE_SIZE_4K, NULL);
13177 if (status != NV_OK)
13178 return status;
13179
13180 status = uvm_push_begin_acquire(gpu->channel_manager,
13181 UVM_CHANNEL_TYPE_MEMOPS,
13182 &block->tracker,
13183 &push,
13184 "Forcing 4k PTEs on block [0x%llx, 0x%llx)",
13185 block->start,
13186 block->end + 1);
13187 if (status != NV_OK)
13188 return status;
13189
13190 if (gpu_state->pte_is_2m)
13191 block_gpu_split_2m(block, block_context, gpu, NULL, &push);
13192 else
13193 block_gpu_split_big(block, block_context, gpu, gpu_state->big_ptes, &push);
13194
13195 uvm_push_end(&push);
13196
13197 UVM_ASSERT(block_check_mappings(block, block_context));
13198
13199 return uvm_tracker_add_push_safe(&block->tracker, &push);
13200 }
13201
13202 NV_STATUS uvm_va_block_set_cancel(uvm_va_block_t *va_block, uvm_va_block_context_t *block_context, uvm_gpu_t *gpu)
13203 {
13204 uvm_assert_mutex_locked(&va_block->lock);
13205
13206 // Volta+ devices support a global VA cancel method that does not require
13207 // 4k PTEs. Thus, skip doing this PTE splitting, particularly because it
13208 // could result in 4k PTEs on P9 systems which otherwise would never need
13209 // them.
13210 if (gpu->parent->fault_cancel_va_supported)
13211 return NV_OK;
13212
13213 return block_gpu_force_4k_ptes(va_block, block_context, gpu);
13214 }
13215
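// Test-only ioctl handler: latch the error and allocation injection
// parameters from params into the block's uvm_va_block_test_t state so that
// later operations on the block pick them up.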
13216 NV_STATUS uvm_test_va_block_inject_error(UVM_TEST_VA_BLOCK_INJECT_ERROR_PARAMS *params, struct file *filp)
13217 {
13218 uvm_va_space_t *va_space = uvm_va_space_get(filp);
13219 struct mm_struct *mm;
13220 uvm_va_block_t *va_block;
13221 uvm_va_block_test_t *va_block_test;
13222 NV_STATUS status = NV_OK;
13223
13224 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13225 uvm_va_space_down_read(va_space);
13226
13227 if (mm)
13228 status = uvm_va_block_find_create(va_space, params->lookup_address, NULL, &va_block);
13229 else
13230 status = uvm_va_block_find_create_managed(va_space, params->lookup_address, &va_block);
13231
13232 if (status != NV_OK)
13233 goto out;
13234
13235 va_block_test = uvm_va_block_get_test(va_block);
13236 UVM_ASSERT(va_block_test);
13237
13238 uvm_mutex_lock(&va_block->lock);
13239
13240 if (params->page_table_allocation_retry_force_count)
13241 va_block_test->page_table_allocation_retry_force_count = params->page_table_allocation_retry_force_count;
13242
13243 if (params->user_pages_allocation_retry_force_count)
13244 va_block_test->user_pages_allocation_retry_force_count = params->user_pages_allocation_retry_force_count;
13245
13246 if (params->cpu_chunk_allocation_size_mask) {
13247 if (params->cpu_chunk_allocation_size_mask & ~UVM_CPU_CHUNK_SIZES ||
13248 !(params->cpu_chunk_allocation_size_mask & PAGE_SIZE)) {
13249 status = NV_ERR_INVALID_ARGUMENT;
13250 goto block_unlock;
13251 }
13252
13253 va_block_test->cpu_chunk_allocation_size_mask = params->cpu_chunk_allocation_size_mask & UVM_CPU_CHUNK_SIZES;
13254 }
13255
13256 if (params->cpu_chunk_allocation_target_id != NUMA_NO_NODE)
13257 va_block_test->cpu_chunk_allocation_target_id = params->cpu_chunk_allocation_target_id;
13258
13259 if (params->cpu_chunk_allocation_actual_id != NUMA_NO_NODE)
13260 va_block_test->cpu_chunk_allocation_actual_id = params->cpu_chunk_allocation_actual_id;
13261
13262 if (params->eviction_error)
13263 va_block_test->inject_eviction_error = params->eviction_error;
13264
13265 if (params->cpu_pages_allocation_error_count)
13266 va_block_test->inject_cpu_pages_allocation_error_count = params->cpu_pages_allocation_error_count;
13267
13268 if (params->populate_error)
13269 va_block_test->inject_populate_error = params->populate_error;
13270
13271 block_unlock:
13272 uvm_mutex_unlock(&va_block->lock);
13273
13274 out:
13275 uvm_va_space_up_read(va_space);
13276 uvm_va_space_mm_or_current_release_unlock(va_space, mm);
13277 return status;
13278 }
13279
13280 static uvm_prot_t g_uvm_test_pte_mapping_to_prot[UVM_TEST_PTE_MAPPING_MAX] =
13281 {
13282 [UVM_TEST_PTE_MAPPING_INVALID] = UVM_PROT_NONE,
13283 [UVM_TEST_PTE_MAPPING_READ_ONLY] = UVM_PROT_READ_ONLY,
13284 [UVM_TEST_PTE_MAPPING_READ_WRITE] = UVM_PROT_READ_WRITE,
13285 [UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC] = UVM_PROT_READ_WRITE_ATOMIC,
13286 };
13287
13288 static UVM_TEST_PTE_MAPPING g_uvm_prot_to_test_pte_mapping[UVM_PROT_MAX] =
13289 {
13290 [UVM_PROT_NONE] = UVM_TEST_PTE_MAPPING_INVALID,
13291 [UVM_PROT_READ_ONLY] = UVM_TEST_PTE_MAPPING_READ_ONLY,
13292 [UVM_PROT_READ_WRITE] = UVM_TEST_PTE_MAPPING_READ_WRITE,
13293 [UVM_PROT_READ_WRITE_ATOMIC] = UVM_TEST_PTE_MAPPING_READ_WRITE_ATOMIC,
13294 };
13295
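// Test-only ioctl handler: change the mapping of a single page for the given
// processor. Only downgrades (including unmapping) are supported; upgrades
// would require revoking other processors' access and currently fail with
// NV_ERR_INVALID_OPERATION.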
13296 NV_STATUS uvm_test_change_pte_mapping(UVM_TEST_CHANGE_PTE_MAPPING_PARAMS *params, struct file *filp)
13297 {
13298 uvm_va_space_t *va_space = uvm_va_space_get(filp);
13299 uvm_va_block_t *block;
13300 struct mm_struct *mm;
13301 NV_STATUS status = NV_OK;
13302 uvm_prot_t curr_prot, new_prot;
13303 uvm_gpu_t *gpu = NULL;
13304 uvm_processor_id_t id;
13305 uvm_tracker_t local_tracker;
13306 uvm_va_block_region_t region;
13307 uvm_va_block_context_t *block_context = NULL;
13308
13309 if (!PAGE_ALIGNED(params->va))
13310 return NV_ERR_INVALID_ADDRESS;
13311
13312 if (params->mapping >= UVM_TEST_PTE_MAPPING_MAX)
13313 return NV_ERR_INVALID_ARGUMENT;
13314
13315 new_prot = g_uvm_test_pte_mapping_to_prot[params->mapping];
13316
13317 // mmap_lock isn't needed for invalidating CPU mappings, but it will be
13318 // needed for inserting them.
13319 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
13320 uvm_va_space_down_read(va_space);
13321
13322     if (uvm_uuid_is_cpu(&params->uuid)) {
13323 id = UVM_ID_CPU;
13324 }
13325 else {
13326         gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, &params->uuid);
13327 if (!gpu) {
13328 status = NV_ERR_INVALID_DEVICE;
13329 goto out;
13330 }
13331
13332 // Check if the GPU can access the VA
13333 if (!uvm_gpu_can_address(gpu, params->va, PAGE_SIZE)) {
13334 status = NV_ERR_OUT_OF_RANGE;
13335 goto out;
13336 }
13337
13338 id = gpu->id;
13339 }
13340
13341 block_context = uvm_va_block_context_alloc(mm);
13342 if (!block_context) {
13343 status = NV_ERR_NO_MEMORY;
13344 goto out;
13345 }
13346
13347 if (mm)
13348 status = uvm_va_block_find_create(va_space, params->va, &block_context->hmm.vma, &block);
13349 else
13350 status = uvm_va_block_find_create_managed(va_space, params->va, &block);
13351
13352 if (status != NV_OK)
13353 goto out;
13354
13355 // TODO: Bug 3912902: UvmTestChangePteMapping() doesn't work on CPU.
13356 if (UVM_ID_IS_CPU(id) && uvm_va_block_is_hmm(block))
13357 goto out;
13358
13359 uvm_mutex_lock(&block->lock);
13360
13361 region = uvm_va_block_region_from_start_size(block, params->va, PAGE_SIZE);
13362 curr_prot = block_page_prot(block, id, region.first);
13363
13364 if (new_prot == curr_prot) {
13365 status = NV_OK;
13366 goto out_block;
13367 }
13368
13369 // TODO: Bug 1766124: Upgrades might require revoking other processors'
13370 // access privileges. We just fail for now. Only downgrades are
13371 // supported. If we allowed upgrades, we would need to check the mm
13372 // like we do for revocation below.
13373 if (new_prot > curr_prot) {
13374 status = NV_ERR_INVALID_OPERATION;
13375 goto out_block;
13376 }
13377
13378 if (new_prot == UVM_PROT_NONE) {
13379 status = uvm_va_block_unmap(block, block_context, id, region, NULL, &block->tracker);
13380 }
13381 else {
13382 UVM_ASSERT(block_is_page_resident_anywhere(block, region.first));
13383
13384 // Revoking CPU mappings performs a combination of unmap + map. The map
13385 // portion requires a valid mm.
13386 if (UVM_ID_IS_CPU(id) && !uvm_va_range_vma_check(block->va_range, mm)) {
13387 status = NV_ERR_INVALID_STATE;
13388 }
13389 else {
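            // Note: uvm_va_block_revoke_prot() revokes the given protection
            // level and above, so revoking (new_prot + 1) downgrades the
            // mapping to exactly new_prot.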
            status = uvm_va_block_revoke_prot(block,
                                              block_context,
                                              id,
                                              region,
                                              NULL,
                                              new_prot + 1,
                                              &block->tracker);
        }
    }

out_block:
    if (status == NV_OK)
        status = uvm_tracker_init_from(&local_tracker, &block->tracker);

    uvm_mutex_unlock(&block->lock);

    if (status == NV_OK)
        status = uvm_tracker_wait_deinit(&local_tracker);

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);

    uvm_va_block_context_free(block_context);

    return status;
}

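// UVM_TEST_VA_BLOCK_INFO ioctl handler: reports the start and end addresses
// of the VA block containing params->lookup_address, for both managed VA
// ranges and HMM blocks. When the address is in an HMM region with no block
// created yet, uvm_hmm_va_block_range_bounds() supplies the reported bounds.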
NV_STATUS uvm_test_va_block_info(UVM_TEST_VA_BLOCK_INFO_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_block_t *va_block;
    uvm_va_range_t *va_range;
    struct mm_struct *mm;
    size_t index;
    NV_STATUS status = NV_OK;

    BUILD_BUG_ON(UVM_TEST_VA_BLOCK_SIZE != UVM_VA_BLOCK_SIZE);

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_read(va_space);

    va_range = uvm_va_range_find(va_space, params->lookup_address);
    if (!va_range) {
        status = uvm_hmm_va_block_find(va_space, params->lookup_address, &va_block);
        if (status == NV_ERR_OBJECT_NOT_FOUND) {
            status = uvm_hmm_va_block_range_bounds(va_space,
                                                   mm,
                                                   params->lookup_address,
                                                   &params->va_block_start,
                                                   &params->va_block_end,
                                                   NULL);
            goto out;
        }
        else if (status != NV_OK) {
            goto out;
        }
    }
    else {
        index = uvm_va_range_block_index(va_range, params->lookup_address);
        va_block = uvm_va_range_block(va_range, index);
        if (!va_block) {
            status = NV_ERR_OBJECT_NOT_FOUND;
            goto out;
        }
    }

    params->va_block_start = va_block->start;
    params->va_block_end = va_block->end;

out:
    uvm_va_space_up_read(va_space);
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    return status;
}

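// UVM_TEST_VA_RESIDENCY_INFO ioctl handler: for the page containing
// params->lookup_address, reports the processors on which the page is
// resident, mapped, and populated, along with the backing physical addresses,
// page sizes, and the CPU NUMA node of residency when applicable.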
NV_STATUS uvm_test_va_residency_info(UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params, struct file *filp)
{
    NV_STATUS status = NV_OK;
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    uvm_va_range_t *va_range;
    uvm_va_block_t *block = NULL;
    uvm_va_block_context_t *block_context = NULL;
    struct mm_struct *mm;
    NvU32 count = 0;
    uvm_processor_mask_t *resident_on_mask;
    uvm_processor_id_t id;
    uvm_page_index_t page_index;
    unsigned release_block_count = 0;
    NvU64 addr = UVM_ALIGN_DOWN(params->lookup_address, PAGE_SIZE);
    size_t index;

    resident_on_mask = uvm_processor_mask_cache_alloc();
    if (!resident_on_mask)
        return NV_ERR_NO_MEMORY;

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);

    block_context = uvm_va_block_context_alloc(mm);
    if (!block_context) {
        status = NV_ERR_NO_MEMORY;
        goto out_unlocked;
    }

    uvm_va_space_down_read(va_space);

    // Inline uvm_va_block_find() to get the va_range.
    va_range = uvm_va_range_find(va_space, addr);
    if (!va_range) {
        NvU64 start, end;

        status = uvm_hmm_va_block_find(va_space, addr, &block);
        if (status != NV_OK) {
            if (status != NV_ERR_OBJECT_NOT_FOUND)
                goto out;
            status = uvm_hmm_va_block_range_bounds(va_space, mm, addr, &start, &end, params);
            goto out;
        }
        // Update current CPU mapping information.
        status = uvm_hmm_va_block_update_residency_info(block, mm, addr, false);
        if (status != NV_OK) {
            block = NULL;
            goto out;
        }
    }
    else if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED) {
        status = NV_ERR_INVALID_ADDRESS;
        goto out;
    }
    else {
        index = uvm_va_range_block_index(va_range, addr);
        block = uvm_va_range_block(va_range, index);
        if (!block) {
            params->resident_on_count = 0;
            params->populated_on_count = 0;
            params->mapped_on_count = 0;
            params->resident_nid = -1;

            status = NV_OK;

            goto out;
        }
    }

    uvm_mutex_lock(&block->lock);

    page_index = uvm_va_block_cpu_page_index(block, addr);
    uvm_va_block_page_resident_processors(block, page_index, resident_on_mask);

    params->resident_nid = -1;
    for_each_id_in_mask(id, resident_on_mask) {
        block_phys_page_t block_page;
        int nid = block_get_page_node_residency(block, page_index);

        block_page = block_phys_page(id, nid, page_index);
        uvm_va_space_processor_uuid(va_space, &params->resident_on[count], id);
        params->resident_physical_size[count] = block_phys_page_size(block, block_page);
        if (UVM_ID_IS_CPU(id)) {
            params->resident_physical_address[count] = page_to_phys(uvm_va_block_get_cpu_page(block, page_index));
            params->resident_nid = nid;

            // Check that the page is only resident on a single CPU NUMA node.
            for_each_possible_uvm_node(nid) {
                if (uvm_va_block_cpu_is_page_resident_on(block, nid, page_index) && nid != params->resident_nid) {
                    status = NV_ERR_INVALID_STATE;
                    goto out;
                }
            }
        }
        else {
            params->resident_physical_address[count] =
                block_phys_page_address(block, block_page, uvm_va_space_get_gpu(va_space, id)).address;
        }

        ++count;
    }

    params->resident_on_count = count;

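    // For each processor that maps the page, report the protection, the page
    // size, and the physical address backing the mapping. The mapping may
    // target another processor's memory (processor_to_map), in which case
    // that processor's physical address is reported.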
    count = 0;
    for_each_id_in_mask(id, &block->mapped) {
        uvm_processor_id_t processor_to_map;
        block_phys_page_t block_page;
        NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
        int nid = NUMA_NO_NODE;

        if (page_size == 0)
            continue;

        uvm_va_space_processor_uuid(va_space, &params->mapped_on[count], id);

        params->mapping_type[count] = g_uvm_prot_to_test_pte_mapping[block_page_prot(block, id, page_index)];
        UVM_ASSERT(params->mapping_type[count] != UVM_TEST_PTE_MAPPING_INVALID);
        processor_to_map = block_get_processor_to_map(block, block_context, id, page_index);
        if (UVM_ID_IS_CPU(processor_to_map))
            nid = block_get_page_node_residency(block, page_index);

        block_page = block_phys_page(processor_to_map, nid, page_index);

        if (!UVM_ID_IS_CPU(id)) {
            uvm_gpu_phys_address_t gpu_phys_addr = block_phys_page_address(block,
                                                                           block_page,
                                                                           uvm_va_space_get_gpu(va_space, id));
            params->mapping_physical_address[count] = gpu_phys_addr.address;
        }
        else {
            struct page *page = block_page_get(block, block_page);

            params->mapping_physical_address[count] = page_to_phys(page);
        }

        params->page_size[count] = page_size;
        ++count;
    }

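    // When the page is resident on exactly one processor, cross-check the
    // residency state against the reverse (physical-to-virtual) mappings.
    // Each successful reverse translation retains a reference on the owning
    // VA block, so count them in release_block_count and drop them with
    // uvm_va_block_release() before returning.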
    if (params->resident_on_count == 1) {
        if (uvm_processor_mask_test(resident_on_mask, UVM_ID_CPU)) {
            if (uvm_pmm_sysmem_mappings_indirect_supported()) {
                for_each_gpu_id(id) {
                    NvU32 page_size = uvm_va_block_page_size_processor(block, id, page_index);
                    uvm_reverse_map_t sysmem_page;
                    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page_resident(block, page_index);
                    size_t num_pages;
                    uvm_gpu_t *gpu;

                    if (!uvm_va_block_gpu_state_get(block, id))
                        continue;

                    gpu = uvm_va_space_get_gpu(va_space, id);

                    if (!gpu->parent->access_counters_can_use_physical_addresses)
                        continue;

                    num_pages = uvm_pmm_sysmem_mappings_dma_to_virt(&gpu->pmm_reverse_sysmem_mappings,
                                                                    uvm_cpu_chunk_get_parent_gpu_phys_addr(chunk,
                                                                                                           gpu->parent),
                                                                    uvm_cpu_chunk_get_size(chunk),
                                                                    &sysmem_page,
                                                                    1);
                    if (page_size > 0)
                        UVM_ASSERT(num_pages == 1);
                    else
                        UVM_ASSERT(num_pages <= 1);

                    if (num_pages == 1) {
                        UVM_ASSERT(sysmem_page.va_block == block);
                        UVM_ASSERT(uvm_reverse_map_start(&sysmem_page) <= addr);
                        UVM_ASSERT(uvm_reverse_map_end(&sysmem_page) > addr);

                        ++release_block_count;
                    }
                }
            }
        }
        else {
            uvm_gpu_id_t id = uvm_processor_mask_find_first_id(resident_on_mask);
            uvm_reverse_map_t gpu_mapping;
            size_t num_pages;
            uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
            uvm_gpu_phys_address_t phys_addr;

            phys_addr = uvm_va_block_gpu_phys_page_address(block, page_index, gpu);
            num_pages = uvm_pmm_gpu_phys_to_virt(&gpu->pmm, phys_addr.address, PAGE_SIZE, &gpu_mapping);

            // Chunk may be in TEMP_PINNED state so it may not have a VA block
            // assigned. In that case, we don't get a valid translation.
            if (num_pages > 0) {
                UVM_ASSERT(num_pages == 1);
                UVM_ASSERT(gpu_mapping.va_block == block);
                UVM_ASSERT(uvm_reverse_map_start(&gpu_mapping) == addr);

                ++release_block_count;
            }
        }
    }

    params->mapped_on_count = count;

    count = 0;
    for_each_id(id) {
        if (!block_processor_page_is_populated(block, id, page_index))
            continue;

        uvm_va_space_processor_uuid(va_space, &params->populated_on[count], id);
        ++count;
    }

    params->populated_on_count = count;

out:
    if (block) {
        if (!params->is_async && status == NV_OK)
            status = uvm_tracker_wait(&block->tracker);
        uvm_mutex_unlock(&block->lock);
        while (release_block_count--)
            uvm_va_block_release(block);
    }

    uvm_va_space_up_read(va_space);
    uvm_va_block_context_free(block_context);

out_unlocked:
    uvm_va_space_mm_or_current_release_unlock(va_space, mm);
    uvm_processor_mask_cache_free(resident_on_mask);

    return status;
}

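// Marks the CPU pages of the entire block as dirty by applying
// block_mark_region_cpu_dirty() to the full block region.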
void uvm_va_block_mark_cpu_dirty(uvm_va_block_t *va_block)
{
    block_mark_region_cpu_dirty(va_block, uvm_va_block_region_from_block(va_block));
}