1 /*******************************************************************************
2 Copyright (c) 2015-2024 NVIDIA Corporation
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to
6 deal in the Software without restriction, including without limitation the
7 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8 sell copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be
12 included in all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 DEALINGS IN THE SOFTWARE.
21
22 *******************************************************************************/
23
24 #include "uvm_api.h"
25 #include "uvm_va_space.h"
26 #include "uvm_va_range.h"
27 #include "uvm_lock.h"
28 #include "uvm_global.h"
29 #include "uvm_kvmalloc.h"
30 #include "uvm_perf_heuristics.h"
31 #include "uvm_user_channel.h"
32 #include "uvm_tools.h"
33 #include "uvm_thread_context.h"
34 #include "uvm_hal.h"
35 #include "uvm_map_external.h"
36 #include "uvm_ats.h"
37 #include "uvm_gpu_access_counters.h"
38 #include "uvm_hmm.h"
39 #include "uvm_va_space_mm.h"
40 #include "uvm_test.h"
41 #include "uvm_common.h"
42 #include "nv_uvm_interface.h"
43 #include "nv-kthread-q.h"
44
processor_mask_array_test(const uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)45 static bool processor_mask_array_test(const uvm_processor_mask_t *mask,
46 uvm_processor_id_t mask_id,
47 uvm_processor_id_t id)
48 {
49 return uvm_processor_mask_test(&mask[uvm_id_value(mask_id)], id);
50 }
51
processor_mask_array_clear(uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)52 static void processor_mask_array_clear(uvm_processor_mask_t *mask,
53 uvm_processor_id_t mask_id,
54 uvm_processor_id_t id)
55 {
56 uvm_processor_mask_clear(&mask[uvm_id_value(mask_id)], id);
57 }
58
processor_mask_array_set(uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)59 static void processor_mask_array_set(uvm_processor_mask_t *mask,
60 uvm_processor_id_t mask_id,
61 uvm_processor_id_t id)
62 {
63 uvm_processor_mask_set(&mask[uvm_id_value(mask_id)], id);
64 }
65
processor_mask_array_empty(const uvm_processor_mask_t * mask,uvm_processor_id_t mask_id)66 static bool processor_mask_array_empty(const uvm_processor_mask_t *mask, uvm_processor_id_t mask_id)
67 {
68 return uvm_processor_mask_empty(&mask[uvm_id_value(mask_id)]);
69 }
70
71 static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
72 static void disable_peers(uvm_va_space_t *va_space,
73 uvm_gpu_t *gpu0,
74 uvm_gpu_t *gpu1,
75 struct list_head *deferred_free_list);
76 static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
77 struct mm_struct *mm,
78 struct list_head *deferred_free_list);
79 static void va_space_remove_dummy_thread_contexts(uvm_va_space_t *va_space);
80
init_tools_data(uvm_va_space_t * va_space)81 static void init_tools_data(uvm_va_space_t *va_space)
82 {
83 int i;
84
85 uvm_init_rwsem(&va_space->tools.lock, UVM_LOCK_ORDER_VA_SPACE_TOOLS);
86
87 for (i = 0; i < ARRAY_SIZE(va_space->tools.counters); i++)
88 INIT_LIST_HEAD(va_space->tools.counters + i);
89 for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v1); i++)
90 INIT_LIST_HEAD(va_space->tools.queues_v1 + i);
91 for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v2); i++)
92 INIT_LIST_HEAD(va_space->tools.queues_v2 + i);
93 }
94
register_gpu_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu)95 static NV_STATUS register_gpu_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
96 {
97 uvm_gpu_t *other_gpu;
98
99 uvm_assert_rwsem_locked(&va_space->lock);
100
101 for_each_va_space_gpu(other_gpu, va_space) {
102 uvm_gpu_peer_t *peer_caps;
103
104 if (uvm_id_equal(other_gpu->id, gpu->id))
105 continue;
106
107 peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
108
109 if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1 || gpu->parent == other_gpu->parent) {
110 NV_STATUS status = enable_peers(va_space, gpu, other_gpu);
111 if (status != NV_OK)
112 return status;
113 }
114 }
115
116 return NV_OK;
117 }
118
va_space_check_processors_masks(uvm_va_space_t * va_space)119 static bool va_space_check_processors_masks(uvm_va_space_t *va_space)
120 {
121 uvm_processor_id_t processor;
122 uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
123 uvm_processor_mask_t *processors = &block_context->scratch_processor_mask;
124
125 uvm_assert_rwsem_locked_write(&va_space->lock);
126
127 uvm_processor_mask_copy(processors, &va_space->registered_gpus);
128 uvm_processor_mask_set(processors, UVM_ID_CPU);
129
130 for_each_id_in_mask(processor, processors) {
131 uvm_processor_id_t other_processor;
132 bool check_can_copy_from = true;
133
134 if (UVM_ID_IS_GPU(processor)) {
135 uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, processor);
136
137 // Peer copies between two processors can be disabled even when they
138 // are NvLink peers, or there is HW support for atomics between
139 // them.
140 if (gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_UNSUPPORTED)
141 check_can_copy_from = false;
142 }
143
144 UVM_ASSERT(processor_mask_array_test(va_space->can_access, processor, processor));
145 UVM_ASSERT(processor_mask_array_test(va_space->accessible_from, processor, processor));
146 UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, processor, processor));
147 UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, processor, UVM_ID_CPU));
148 UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, UVM_ID_CPU, processor));
149
150 // NVLINK
151 UVM_ASSERT(!processor_mask_array_test(va_space->has_nvlink, processor, processor));
152
153 if (check_can_copy_from) {
154 UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_nvlink[uvm_id_value(processor)],
155 &va_space->can_copy_from[uvm_id_value(processor)]));
156 }
157
158 // Peers
159 UVM_ASSERT(!processor_mask_array_test(va_space->indirect_peers, processor, processor));
160 UVM_ASSERT(uvm_processor_mask_subset(&va_space->indirect_peers[uvm_id_value(processor)],
161 &va_space->has_native_atomics[uvm_id_value(processor)]));
162
163 // Atomics
164 UVM_ASSERT(processor_mask_array_test(va_space->has_native_atomics, processor, processor));
165
166 if (check_can_copy_from) {
167 UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)],
168 &va_space->can_copy_from[uvm_id_value(processor)]));
169 }
170
171 UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)],
172 &va_space->can_access[uvm_id_value(processor)]));
173
174 for_each_id_in_mask(other_processor, &va_space->can_access[uvm_id_value(processor)])
175 UVM_ASSERT(processor_mask_array_test(va_space->accessible_from, other_processor, processor));
176
177 for_each_id_in_mask(other_processor, &va_space->accessible_from[uvm_id_value(processor)])
178 UVM_ASSERT(processor_mask_array_test(va_space->can_access, other_processor, processor));
179 }
180
181 return true;
182 }
183
uvm_va_space_create(struct address_space * mapping,uvm_va_space_t ** va_space_ptr,NvU64 flags)184 NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va_space_ptr, NvU64 flags)
185 {
186 NV_STATUS status;
187 uvm_va_space_t *va_space = uvm_kvmalloc_zero(sizeof(*va_space));
188 uvm_gpu_id_t gpu_id;
189
190 *va_space_ptr = NULL;
191 if (!va_space)
192 return NV_ERR_NO_MEMORY;
193
194 if (flags & ~UVM_INIT_FLAGS_MASK) {
195 uvm_kvfree(va_space);
196 return NV_ERR_INVALID_ARGUMENT;
197 }
198
199 uvm_init_rwsem(&va_space->lock, UVM_LOCK_ORDER_VA_SPACE);
200 uvm_mutex_init(&va_space->closest_processors.mask_mutex, UVM_LOCK_ORDER_LEAF);
201 uvm_mutex_init(&va_space->serialize_writers_lock, UVM_LOCK_ORDER_VA_SPACE_SERIALIZE_WRITERS);
202 uvm_mutex_init(&va_space->read_acquire_write_release_lock,
203 UVM_LOCK_ORDER_VA_SPACE_READ_ACQUIRE_WRITE_RELEASE_LOCK);
204 uvm_spin_lock_init(&va_space->va_space_mm.lock, UVM_LOCK_ORDER_LEAF);
205 uvm_range_tree_init(&va_space->va_range_tree);
206 uvm_ats_init_va_space(va_space);
207
208 // Init to 0 since we rely on atomic_inc_return behavior to return 1 as the first ID
209 atomic64_set(&va_space->range_group_id_counter, 0);
210
211 INIT_RADIX_TREE(&va_space->range_groups, NV_UVM_GFP_FLAGS);
212 uvm_range_tree_init(&va_space->range_group_ranges);
213
214 bitmap_zero(va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);
215
216 // CPU is not explicitly registered in the va space
217 processor_mask_array_set(va_space->can_access, UVM_ID_CPU, UVM_ID_CPU);
218 processor_mask_array_set(va_space->accessible_from, UVM_ID_CPU, UVM_ID_CPU);
219 processor_mask_array_set(va_space->can_copy_from, UVM_ID_CPU, UVM_ID_CPU);
220 processor_mask_array_set(va_space->has_native_atomics, UVM_ID_CPU, UVM_ID_CPU);
221
222 // CPU always participates in system-wide atomics
223 uvm_processor_mask_set(&va_space->system_wide_atomics_enabled_processors, UVM_ID_CPU);
224 uvm_processor_mask_set(&va_space->faultable_processors, UVM_ID_CPU);
225
226 // Initialize the CPU/GPU affinity array. New CPU NUMA nodes are added at
227 // GPU registration time, but they are never freed on unregister_gpu
228 // (although the GPU is removed from the corresponding mask).
229 for_each_gpu_id(gpu_id) {
230 uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
231
232 affinity->numa_node = -1;
233 uvm_processor_mask_zero(&affinity->gpus);
234 }
235
236 init_waitqueue_head(&va_space->va_space_mm.last_retainer_wait_queue);
237 init_waitqueue_head(&va_space->gpu_va_space_deferred_free.wait_queue);
238
239 va_space->mapping = mapping;
240 va_space->test.page_prefetch_enabled = true;
241
242 init_tools_data(va_space);
243
244 uvm_down_write_mmap_lock(current->mm);
245 uvm_va_space_down_write(va_space);
246
247 va_space->va_block_context = uvm_va_block_context_alloc(NULL);
248 if (!va_space->va_block_context) {
249 status = NV_ERR_NO_MEMORY;
250 goto fail;
251 }
252
253 status = uvm_perf_init_va_space_events(va_space, &va_space->perf_events);
254 if (status != NV_OK)
255 goto fail;
256
257 status = uvm_perf_heuristics_load(va_space);
258 if (status != NV_OK)
259 goto fail;
260
261 status = uvm_gpu_init_va_space(va_space);
262 if (status != NV_OK)
263 goto fail;
264
265 UVM_ASSERT(va_space_check_processors_masks(va_space));
266
267 va_space->initialization_flags = flags;
268
269 status = uvm_va_space_mm_register(va_space);
270 if (status != NV_OK)
271 goto fail;
272
273 uvm_hmm_va_space_initialize(va_space);
274
275 uvm_va_space_up_write(va_space);
276 uvm_up_write_mmap_lock(current->mm);
277
278 uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
279 list_add_tail(&va_space->list_node, &g_uvm_global.va_spaces.list);
280 uvm_mutex_unlock(&g_uvm_global.va_spaces.lock);
281
282 *va_space_ptr = va_space;
283
284 return NV_OK;
285
286 fail:
287 uvm_perf_heuristics_unload(va_space);
288 uvm_perf_destroy_va_space_events(&va_space->perf_events);
289 uvm_va_block_context_free(va_space->va_block_context);
290 uvm_va_space_up_write(va_space);
291 uvm_up_write_mmap_lock(current->mm);
292
293 // See the comment in uvm_va_space_mm_unregister() for why this has to be
294 // called after releasing the locks.
295 uvm_va_space_mm_unregister(va_space);
296
297 uvm_kvfree(va_space);
298
299 return status;
300 }
301
302 // This function does *not* release the GPU, nor the GPU's PCIE peer pairings.
303 // Those are returned so the caller can do it after dropping the VA space lock.
unregister_gpu(uvm_va_space_t * va_space,uvm_gpu_t * gpu,struct mm_struct * mm,struct list_head * deferred_free_list,uvm_processor_mask_t * peers_to_release)304 static void unregister_gpu(uvm_va_space_t *va_space,
305 uvm_gpu_t *gpu,
306 struct mm_struct *mm,
307 struct list_head *deferred_free_list,
308 uvm_processor_mask_t *peers_to_release)
309 {
310 uvm_gpu_t *peer_gpu;
311 uvm_va_range_t *va_range;
312 NvU32 peer_table_index;
313
314 uvm_assert_rwsem_locked_write(&va_space->lock);
315
316 if (peers_to_release)
317 uvm_processor_mask_zero(peers_to_release);
318
319 // If a GPU VA Space was explicitly registered, but not explicitly
320 // unregistered, unregister it and add all of its objects to the free list.
321 remove_gpu_va_space(uvm_gpu_va_space_get(va_space, gpu), mm, deferred_free_list);
322
323 uvm_for_each_va_range(va_range, va_space)
324 uvm_va_range_unregister_gpu(va_range, gpu, mm, deferred_free_list);
325
326 uvm_hmm_unregister_gpu(va_space, gpu, mm);
327
328 // If this GPU has any peer-to-peer pair that was explicitly enabled, but
329 // not explicitly disabled, disable it.
330 // Notably do this only after unregistering the GPU from VA ranges to make
331 // sure there is no pending work using the peer mappings within the VA
332 // blocks (in particular migrations using the peer identity mappings).
333 for_each_va_space_gpu(peer_gpu, va_space) {
334 if (gpu == peer_gpu)
335 continue;
336
337 peer_table_index = uvm_gpu_peer_table_index(gpu->id, peer_gpu->id);
338 if (test_bit(peer_table_index, va_space->enabled_peers)) {
339 disable_peers(va_space, gpu, peer_gpu, deferred_free_list);
340
341 // Only PCIE peers need to be globally released. NVLINK peers are
342 // brought up and torn down automatically within add_gpu and
343 // remove_gpu.
344 if (peers_to_release && g_uvm_global.peers[peer_table_index].link_type == UVM_GPU_LINK_PCIE)
345 uvm_processor_mask_set(peers_to_release, peer_gpu->id);
346 }
347 }
348
349 if (gpu->parent->isr.replayable_faults.handling) {
350 UVM_ASSERT(uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
351 uvm_processor_mask_clear(&va_space->faultable_processors, gpu->id);
352 uvm_processor_mask_clear(&va_space->system_wide_atomics_enabled_processors, gpu->id);
353 }
354 else {
355 UVM_ASSERT(uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
356 uvm_processor_mask_clear(&va_space->non_faultable_processors, gpu->id);
357 }
358
359 processor_mask_array_clear(va_space->can_access, gpu->id, gpu->id);
360 processor_mask_array_clear(va_space->can_access, gpu->id, UVM_ID_CPU);
361 processor_mask_array_clear(va_space->can_access, UVM_ID_CPU, gpu->id);
362 UVM_ASSERT(processor_mask_array_empty(va_space->can_access, gpu->id));
363
364 processor_mask_array_clear(va_space->accessible_from, gpu->id, gpu->id);
365 processor_mask_array_clear(va_space->accessible_from, gpu->id, UVM_ID_CPU);
366 processor_mask_array_clear(va_space->accessible_from, UVM_ID_CPU, gpu->id);
367 UVM_ASSERT(processor_mask_array_empty(va_space->accessible_from, gpu->id));
368
369 processor_mask_array_clear(va_space->can_copy_from, gpu->id, gpu->id);
370 processor_mask_array_clear(va_space->can_copy_from, gpu->id, UVM_ID_CPU);
371 processor_mask_array_clear(va_space->can_copy_from, UVM_ID_CPU, gpu->id);
372 UVM_ASSERT(processor_mask_array_empty(va_space->can_copy_from, gpu->id));
373
374 processor_mask_array_clear(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
375 processor_mask_array_clear(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
376 UVM_ASSERT(processor_mask_array_empty(va_space->has_nvlink, gpu->id));
377
378 UVM_ASSERT(processor_mask_array_empty(va_space->indirect_peers, gpu->id));
379
380 processor_mask_array_clear(va_space->has_native_atomics, gpu->id, gpu->id);
381 processor_mask_array_clear(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
382 processor_mask_array_clear(va_space->has_native_atomics, UVM_ID_CPU, gpu->id);
383 UVM_ASSERT(processor_mask_array_empty(va_space->has_native_atomics, gpu->id));
384
385 uvm_processor_mask_clear(&va_space->registered_gpus, gpu->id);
386 va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = NULL;
387
388 // Remove the GPU from the CPU/GPU affinity masks
389 if (gpu->parent->closest_cpu_numa_node != -1) {
390 uvm_gpu_id_t gpu_id;
391
392 for_each_gpu_id(gpu_id) {
393 uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
394
395 if (affinity->numa_node == gpu->parent->closest_cpu_numa_node) {
396 uvm_processor_mask_clear(&affinity->gpus, gpu->id);
397 break;
398 }
399 }
400 }
401
402 if (va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)]) {
403 uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool,
404 va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)],
405 &va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)]->tracker);
406 }
407 va_space_check_processors_masks(va_space);
408 }
409
gpu_va_space_stop_all_channels(uvm_gpu_va_space_t * gpu_va_space)410 static void gpu_va_space_stop_all_channels(uvm_gpu_va_space_t *gpu_va_space)
411 {
412 uvm_user_channel_t *user_channel;
413
414 list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node)
415 uvm_user_channel_stop(user_channel);
416
417 // Prevent new channels from being registered since we'll be dropping the
418 // VA space lock shortly with the expectation that no more channels will
419 // arrive.
420 atomic_set(&gpu_va_space->disallow_new_channels, 1);
421 }
422
423 // Detaches (unregisters) all user channels in a GPU VA space. The channels must
424 // have previously been stopped.
425 //
426 // The detached channels are added to the input list. The caller is expected to
427 // drop the VA space lock and call uvm_deferred_free_object_list to complete the
428 // destroy operation.
uvm_gpu_va_space_detach_all_user_channels(uvm_gpu_va_space_t * gpu_va_space,struct list_head * deferred_free_list)429 static void uvm_gpu_va_space_detach_all_user_channels(uvm_gpu_va_space_t *gpu_va_space,
430 struct list_head *deferred_free_list)
431 {
432 uvm_user_channel_t *user_channel, *next_channel;
433 list_for_each_entry_safe(user_channel, next_channel, &gpu_va_space->registered_channels, list_node)
434 uvm_user_channel_detach(user_channel, deferred_free_list);
435 }
436
uvm_va_space_detach_all_user_channels(uvm_va_space_t * va_space,struct list_head * deferred_free_list)437 void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list_head *deferred_free_list)
438 {
439 uvm_gpu_va_space_t *gpu_va_space;
440 for_each_gpu_va_space(gpu_va_space, va_space)
441 uvm_gpu_va_space_detach_all_user_channels(gpu_va_space, deferred_free_list);
442 }
443
uvm_va_space_destroy(uvm_va_space_t * va_space)444 void uvm_va_space_destroy(uvm_va_space_t *va_space)
445 {
446 uvm_va_range_t *va_range, *va_range_next;
447 uvm_gpu_t *gpu;
448 uvm_gpu_id_t gpu_id;
449 uvm_processor_mask_t *retained_gpus = &va_space->registered_gpus_teardown;
450 LIST_HEAD(deferred_free_list);
451
452 // Remove the VA space from the global list before we start tearing things
453 // down so other threads can't see the VA space in a partially-valid state.
454 uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
455 list_del(&va_space->list_node);
456 uvm_mutex_unlock(&g_uvm_global.va_spaces.lock);
457
458 uvm_perf_heuristics_stop(va_space);
459
460 // Stop all channels before unmapping anything. This kills the channels and
461 // prevents spurious MMU faults from being generated (bug 1722021), but
462 // doesn't prevent the bottom half from servicing old faults for those
463 // channels.
464 //
465 // This involves making RM calls, so we have to do that with the VA space
466 // lock in read mode.
467 uvm_va_space_down_read_rm(va_space);
468 uvm_va_space_stop_all_user_channels(va_space);
469 uvm_va_space_up_read_rm(va_space);
470
471 // The bottom half GPU page fault handler(s) could still look up and use
472 // this va_space via the GPU's instance_ptr_table. Lock them out while we
473 // tear down. Once we're done, the bottom half will fail to find any
474 // registered GPUs in the VA space, so those faults will be canceled.
475 uvm_va_space_down_write(va_space);
476
477 uvm_processor_mask_copy(retained_gpus, &va_space->registered_gpus);
478
479 bitmap_copy(va_space->enabled_peers_teardown, va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);
480
481 uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list);
482
483 // Destroy all VA ranges. We do this before unregistering the GPUs for
484 // performance, since GPU unregister will walk all VA ranges in the VA space
485 // multiple times.
486 uvm_for_each_va_range_safe(va_range, va_range_next, va_space) {
487 // All channel ranges should've been destroyed by the channel unregister
488 // above
489 UVM_ASSERT(va_range->type != UVM_VA_RANGE_TYPE_CHANNEL);
490 uvm_va_range_destroy(va_range, &deferred_free_list);
491 }
492
493 uvm_range_group_radix_tree_destroy(va_space);
494
495 // Unregister all GPUs in the VA space. Note that this does not release the
496 // GPUs nor peers. We do that below.
497 for_each_va_space_gpu(gpu, va_space)
498 unregister_gpu(va_space, gpu, NULL, &deferred_free_list, NULL);
499
500 uvm_hmm_va_space_destroy(va_space);
501
502 uvm_perf_heuristics_unload(va_space);
503 uvm_perf_destroy_va_space_events(&va_space->perf_events);
504
505 va_space_remove_dummy_thread_contexts(va_space);
506
507 // Destroy the VA space's block context node tracking after all ranges have
508 // been destroyed as the VA blocks may reference it.
509 uvm_va_block_context_free(va_space->va_block_context);
510
511 uvm_va_space_up_write(va_space);
512
513 UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpus));
514 UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpu_va_spaces));
515
516 for_each_gpu_id(gpu_id)
517 UVM_ASSERT(va_space->registered_gpus_table[uvm_id_gpu_index(gpu_id)] == NULL);
518
519 // The instance pointer mappings for this VA space have been removed so no
520 // new bottom halves can get to this VA space, but there could still be
521 // bottom halves running from before we removed the mapping. Rather than
522 // ref-count the VA space, just wait for them to finish.
523 //
524 // This is also required to synchronize any pending
525 // block_deferred_accessed_by() work items.
526
527 nv_kthread_q_flush(&g_uvm_global.global_q);
528
529 for_each_gpu_in_mask(gpu, retained_gpus) {
530 // Free the processor masks allocated in uvm_va_space_register_gpu().
531 // The mask is also freed in uvm_va_space_unregister_gpu() but that
532 // function won't be called in uvm_release() and uvm_release_deferred()
533 // path.
534 uvm_processor_mask_cache_free(va_space->peers_to_release[uvm_id_value(gpu->id)]);
535
536 // Set the pointer to NULL to avoid accidental re-use and double free.
537 va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
538
539 if (!gpu->parent->isr.replayable_faults.handling) {
540 UVM_ASSERT(!gpu->parent->isr.non_replayable_faults.handling);
541 continue;
542 }
543
544 nv_kthread_q_flush(&gpu->parent->isr.bottom_half_q);
545
546 // The same applies to the kill channel kthreads. However, they need to
547 // be flushed after their bottom-half counterparts since the latter may
548 // schedule a channel kill.
549 if (gpu->parent->isr.non_replayable_faults.handling)
550 nv_kthread_q_flush(&gpu->parent->isr.kill_channel_q);
551
552 if (gpu->parent->access_counters_supported)
553 uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
554
555 }
556
557 // Check that all CPU/GPU affinity masks are empty
558 for_each_gpu_id(gpu_id) {
559 const uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
560
561 UVM_ASSERT(uvm_processor_mask_empty(&affinity->gpus));
562 }
563
564 // ensure that there are no pending events that refer to this va_space
565 uvm_tools_flush_events();
566
567 // Perform cleanup we can't do while holding the VA space lock
568
569 uvm_deferred_free_object_list(&deferred_free_list);
570
571 // Normally we'd expect this to happen as part of uvm_mm_release()
572 // but if userspace never initialized uvm_mm_fd that won't happen.
573 // We don't have to take the va_space_mm spinlock and update state
574 // here because we know no other thread can be in or subsequently
575 // call uvm_api_mm_initialize successfully because the UVM
576 // file-descriptor has been released.
577 if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
578 uvm_va_space_mm_unregister(va_space);
579 UVM_ASSERT(!uvm_va_space_mm_alive(&va_space->va_space_mm));
580
581 uvm_mutex_lock(&g_uvm_global.global_lock);
582
583 // Release the GPUs and their peer counts. Do not use
584 // for_each_gpu_in_mask for the outer loop as it reads the GPU
585 // state, which might get destroyed.
586 for_each_gpu_id_in_mask(gpu_id, retained_gpus) {
587 uvm_gpu_t *peer_gpu;
588
589 gpu = uvm_gpu_get(gpu_id);
590
591 uvm_processor_mask_clear(retained_gpus, gpu_id);
592
593 for_each_gpu_in_mask(peer_gpu, retained_gpus) {
594 NvU32 peer_table_index = uvm_gpu_peer_table_index(gpu->id, peer_gpu->id);
595 if (test_bit(peer_table_index, va_space->enabled_peers_teardown)) {
596 uvm_gpu_peer_t *peer_caps = &g_uvm_global.peers[peer_table_index];
597
598 if (peer_caps->link_type == UVM_GPU_LINK_PCIE)
599 uvm_gpu_release_pcie_peer_access(gpu, peer_gpu);
600
601 __clear_bit(peer_table_index, va_space->enabled_peers_teardown);
602 }
603 }
604
605 uvm_gpu_release_locked(gpu);
606 }
607
608 UVM_ASSERT(bitmap_empty(va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS));
609 UVM_ASSERT(bitmap_empty(va_space->enabled_peers_teardown, UVM_MAX_UNIQUE_GPU_PAIRS));
610
611 uvm_mutex_unlock(&g_uvm_global.global_lock);
612
613 uvm_kvfree(va_space->mapping);
614 uvm_kvfree(va_space);
615 }
616
uvm_va_space_stop_all_user_channels(uvm_va_space_t * va_space)617 void uvm_va_space_stop_all_user_channels(uvm_va_space_t *va_space)
618 {
619 uvm_gpu_va_space_t *gpu_va_space;
620 uvm_user_channel_t *user_channel;
621
622 // Skip if all channels have been already stopped.
623 if (atomic_read(&va_space->user_channels_stopped))
624 return;
625
626 uvm_assert_rwsem_locked_read(&va_space->lock);
627
628 for_each_gpu_va_space(gpu_va_space, va_space) {
629 list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node)
630 uvm_user_channel_stop(user_channel);
631 }
632
633 // Since we're holding the VA space lock in read mode, multiple threads
634 // could set this concurrently. user_channels_stopped never transitions back
635 // to 0 after being set to 1 so that's not a problem.
636 atomic_set(&va_space->user_channels_stopped, 1);
637 }
638
uvm_va_space_get_gpu_by_uuid(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)639 uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
640 {
641 uvm_gpu_t *gpu;
642
643 for_each_va_space_gpu(gpu, va_space) {
644 if (uvm_uuid_eq(&gpu->uuid, gpu_uuid))
645 return gpu;
646 }
647
648 return NULL;
649 }
650
uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)651 uvm_gpu_t *uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t *va_space,
652 const NvProcessorUuid *gpu_uuid)
653 {
654 uvm_gpu_t *gpu;
655
656 gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
657 if (!gpu || !uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
658 return NULL;
659
660 return gpu;
661 }
662
uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)663 uvm_gpu_t *uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
664 {
665 uvm_gpu_t *gpu;
666
667 uvm_va_space_down_read(va_space);
668
669 gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
670 if (gpu)
671 uvm_gpu_retain(gpu);
672
673 uvm_va_space_up_read(va_space);
674
675 return gpu;
676 }
677
uvm_va_space_can_read_duplicate(uvm_va_space_t * va_space,uvm_gpu_t * changing_gpu)678 bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changing_gpu)
679 {
680 NvU32 count = va_space->num_non_faultable_gpu_va_spaces;
681
682 if (changing_gpu && !uvm_processor_mask_test(&va_space->faultable_processors, changing_gpu->id)) {
683 if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, changing_gpu->id)) {
684 // A non-faultable GPU is getting removed.
685 UVM_ASSERT(count > 0);
686 --count;
687 }
688 else {
689 // A non-faultable GPU is getting added.
690 ++count;
691 }
692 }
693
694 return count == 0;
695 }
696
697 // Note that the "VA space" in the function name refers to a UVM per-process
698 // VA space. (This is different from a per-GPU VA space.)
uvm_va_space_register_gpu(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid,const uvm_rm_user_object_t * user_rm_device,NvBool * numa_enabled,NvS32 * numa_node_id,NvProcessorUuid * uuid_out)699 NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
700 const NvProcessorUuid *gpu_uuid,
701 const uvm_rm_user_object_t *user_rm_device,
702 NvBool *numa_enabled,
703 NvS32 *numa_node_id,
704 NvProcessorUuid *uuid_out)
705 {
706 NV_STATUS status;
707 uvm_va_range_t *va_range;
708 uvm_gpu_t *gpu;
709 uvm_gpu_t *other_gpu;
710 bool gpu_can_access_sysmem = true;
711 uvm_processor_mask_t *peers_to_release = NULL;
712
713 status = uvm_gpu_retain_by_uuid(gpu_uuid, user_rm_device, &gpu);
714 if (status != NV_OK)
715 return status;
716
717 uvm_uuid_copy(uuid_out, &gpu->uuid);
718
719 // Enabling access counters requires taking the ISR lock, so it is done
720 // without holding the (deeper order) VA space lock. Enabling the counters
721 // after dropping the VA space lock would create a window of time in which
722 // another thread could see the GPU as registered, but access counters would
723 // be disabled. Therefore, the counters are enabled before taking the VA
724 // space lock.
725 if (uvm_parent_gpu_access_counters_required(gpu->parent)) {
726 status = uvm_gpu_access_counters_enable(gpu, va_space);
727 if (status != NV_OK) {
728 uvm_gpu_release(gpu);
729 return status;
730 }
731 }
732
733 uvm_va_space_down_write(va_space);
734
735 // Make sure the gpu hasn't been already registered in this va space
736 if (uvm_processor_mask_test(&va_space->registered_gpus, gpu->id)) {
737 status = NV_ERR_INVALID_DEVICE;
738 goto done;
739 }
740
741 // Mixing coherent and non-coherent GPUs is not supported
742 for_each_va_space_gpu(other_gpu, va_space) {
743 if (uvm_parent_gpu_is_coherent(gpu->parent) != uvm_parent_gpu_is_coherent(other_gpu->parent)) {
744 status = NV_ERR_INVALID_DEVICE;
745 goto done;
746 }
747 }
748
749 // The VA space's mm is being torn down, so don't allow more work
750 if (va_space->disallow_new_registers) {
751 status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
752 goto done;
753 }
754
755 if (g_uvm_global.conf_computing_enabled) {
756 NvU32 gpu_index = uvm_id_gpu_index(gpu->id);
757 status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
758 &va_space->gpu_unregister_dma_buffer[gpu_index],
759 NULL);
760 if (status != NV_OK)
761 goto done;
762
763 gpu_can_access_sysmem = false;
764 }
765
766 UVM_ASSERT(!va_space->peers_to_release[uvm_id_value(gpu->id)]);
767
768 peers_to_release = uvm_processor_mask_cache_alloc();
769 if (!peers_to_release) {
770 status = NV_ERR_NO_MEMORY;
771 goto done;
772 }
773
774 va_space->peers_to_release[uvm_id_value(gpu->id)] = peers_to_release;
775
776 uvm_processor_mask_set(&va_space->registered_gpus, gpu->id);
777 va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = gpu;
778
779 if (gpu->parent->isr.replayable_faults.handling) {
780 UVM_ASSERT(!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
781 uvm_processor_mask_set(&va_space->faultable_processors, gpu->id);
782
783 UVM_ASSERT(!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id));
784 // System-wide atomics are enabled by default
785 uvm_processor_mask_set(&va_space->system_wide_atomics_enabled_processors, gpu->id);
786 }
787 else {
788 UVM_ASSERT(!uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
789 uvm_processor_mask_set(&va_space->non_faultable_processors, gpu->id);
790 }
791
792 // All GPUs have native atomics on their own memory
793 processor_mask_array_set(va_space->has_native_atomics, gpu->id, gpu->id);
794
795 // TODO: Bug 3252572: Support the new link type UVM_GPU_LINK_C2C
796 if (gpu->parent->system_bus.link >= UVM_GPU_LINK_NVLINK_1) {
797 processor_mask_array_set(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
798 processor_mask_array_set(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
799 }
800
801 if (uvm_parent_gpu_is_coherent(gpu->parent)) {
802 processor_mask_array_set(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
803
804 if (gpu->mem_info.numa.enabled) {
805 processor_mask_array_set(va_space->can_access, UVM_ID_CPU, gpu->id);
806 processor_mask_array_set(va_space->accessible_from, gpu->id, UVM_ID_CPU);
807 processor_mask_array_set(va_space->has_native_atomics, UVM_ID_CPU, gpu->id);
808 }
809 }
810
811 // All processors have direct access to their own memory
812 processor_mask_array_set(va_space->can_access, gpu->id, gpu->id);
813 processor_mask_array_set(va_space->accessible_from, gpu->id, gpu->id);
814
815 if (gpu_can_access_sysmem) {
816 processor_mask_array_set(va_space->can_access, gpu->id, UVM_ID_CPU);
817 processor_mask_array_set(va_space->accessible_from, UVM_ID_CPU, gpu->id);
818 }
819
820 processor_mask_array_set(va_space->can_copy_from, gpu->id, gpu->id);
821 processor_mask_array_set(va_space->can_copy_from, gpu->id, UVM_ID_CPU);
822 processor_mask_array_set(va_space->can_copy_from, UVM_ID_CPU, gpu->id);
823
824 // Update the CPU/GPU affinity masks
825 if (gpu->parent->closest_cpu_numa_node != -1) {
826 uvm_gpu_id_t gpu_id;
827
828 for_each_gpu_id(gpu_id) {
829 uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
830
831 // If this is the first time this node is seen, take a new entry of
832 // the array. Entries are never released in order to avoid having
833 // to deal with holes.
834 if (affinity->numa_node == -1) {
835 UVM_ASSERT(uvm_processor_mask_empty(&affinity->gpus));
836 affinity->numa_node = gpu->parent->closest_cpu_numa_node;
837 }
838
839 if (affinity->numa_node == gpu->parent->closest_cpu_numa_node) {
840 uvm_processor_mask_set(&affinity->gpus, gpu->id);
841 break;
842 }
843 }
844 }
845
846 status = register_gpu_peers(va_space, gpu);
847 if (status != NV_OK)
848 goto cleanup;
849
850 uvm_perf_heuristics_register_gpu(va_space, gpu);
851
852 uvm_for_each_va_range(va_range, va_space) {
853 status = uvm_va_range_register_gpu(va_range, gpu);
854 if (status != NV_OK)
855 goto cleanup;
856 }
857
858 if (gpu->mem_info.numa.enabled) {
859 *numa_enabled = NV_TRUE;
860 *numa_node_id = (NvS32)uvm_gpu_numa_node(gpu);
861 }
862 else {
863 *numa_enabled = NV_FALSE;
864 *numa_node_id = -1;
865 }
866
867 goto done;
868
869 cleanup:
870 // Clear out all of the processor mask bits. No VA ranges have mapped or
871 // allocated anything on this GPU yet if we fail here, so we don't need
872 // a deferred_free_list, mm, etc.
873 unregister_gpu(va_space, gpu, NULL, NULL, NULL);
874
875 va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
876
877 uvm_processor_mask_cache_free(peers_to_release);
878
879 done:
880 UVM_ASSERT(va_space_check_processors_masks(va_space));
881
882 uvm_va_space_up_write(va_space);
883
884 if (status != NV_OK) {
885 // There is no risk of disabling access counters on a previously
886 // registered GPU: the enablement step would have failed before even
887 // discovering that the GPU is already registered.
888 if (uvm_parent_gpu_access_counters_required(gpu->parent))
889 uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
890
891 uvm_gpu_release(gpu);
892 }
893
894 return status;
895 }
896
uvm_va_space_unregister_gpu(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)897 NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
898 {
899 uvm_gpu_t *gpu;
900 uvm_gpu_va_space_t *gpu_va_space;
901 struct mm_struct *mm;
902 uvm_gpu_id_t peer_gpu_id;
903 uvm_processor_mask_t *peers_to_release;
904 LIST_HEAD(deferred_free_list);
905
906 // Stopping channels requires holding the VA space lock in read mode, so do
907 // it first. We start in write mode then drop to read in order to flush out
908 // other threads which are in the read-mode portion of any of the register
909 // or unregister operations.
910 uvm_va_space_down_write(va_space);
911
912 gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
913 if (!gpu) {
914 uvm_va_space_up_write(va_space);
915 return NV_ERR_INVALID_DEVICE;
916 }
917
918 // We have to drop the VA space lock below mid-unregister. We have to
919 // prevent any other threads from coming in during that window and allowing
920 // new channels to enter the GPU. That means we must disallow:
921 // - GPU VA space register
922 // - GPU unregister (which would allow new GPU registers)
923 if (uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id)) {
924 uvm_va_space_up_write(va_space);
925 return NV_ERR_INVALID_DEVICE;
926 }
927
928 uvm_processor_mask_set(&va_space->gpu_unregister_in_progress, gpu->id);
929
930 uvm_va_space_downgrade_write_rm(va_space);
931
932 gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
933 if (gpu_va_space)
934 gpu_va_space_stop_all_channels(gpu_va_space);
935
936 // We need to drop the lock to re-take it in write mode. We don't have to
937 // retain the GPU because we've prevented other threads from unregistering
938 // it from the VA space until we're done.
939 uvm_va_space_up_read_rm(va_space);
940
941 // If uvm_parent_gpu_access_counters_required(gpu->parent) is true, a
942 // concurrent registration could enable access counters after they are
943 // disabled here.
944 // The concurrent registration will fail later on if it acquires the VA
945 // space lock before the unregistration does (because the GPU is still
946 // registered) and undo the access counters enablement, or succeed if it
947 // acquires the VA space lock after the unregistration does. Both outcomes
948 // result on valid states.
949 if (gpu->parent->access_counters_supported)
950 uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
951
952 // mmap_lock is needed to establish CPU mappings to any pages evicted from
953 // the GPU if accessed by CPU is set for them.
954 mm = uvm_va_space_mm_or_current_retain_lock(va_space);
955
956 uvm_va_space_down_write(va_space);
957
958 // We blocked out other GPU unregisters, so this GPU must still be
959 // registered. However, the GPU VA space might have been unregistered on us.
960 UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu->id));
961 if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
962 UVM_ASSERT(uvm_gpu_va_space_get(va_space, gpu) == gpu_va_space);
963
964 peers_to_release = va_space->peers_to_release[uvm_id_value(gpu->id)];
965
966 va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
967
968 // This will call disable_peers for all GPU's peers, including NVLink
969 unregister_gpu(va_space, gpu, mm, &deferred_free_list, peers_to_release);
970
971 UVM_ASSERT(uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id));
972 uvm_processor_mask_clear(&va_space->gpu_unregister_in_progress, gpu->id);
973
974 uvm_va_space_up_write(va_space);
975
976 // Unlock the mm since the call to uvm_deferred_free_object_list() requires
977 // that we don't hold any locks. We don't release the mm yet because that
978 // could call uvm_va_space_mm_shutdown() which waits for the deferred free
979 // list to be empty which would cause a deadlock.
980 if (mm)
981 uvm_up_read_mmap_lock(mm);
982
983 uvm_deferred_free_object_list(&deferred_free_list);
984
985 // Release the VA space's GPU and peer counts
986 uvm_mutex_lock(&g_uvm_global.global_lock);
987
988 // Do not use for_each_gpu_in_mask as it reads the peer GPU state,
989 // which might get destroyed when we release the peer entry.
990 UVM_ASSERT(peers_to_release);
991
992 for_each_gpu_id_in_mask(peer_gpu_id, peers_to_release) {
993 uvm_gpu_t *peer_gpu = uvm_gpu_get(peer_gpu_id);
994 UVM_ASSERT(uvm_gpu_peer_caps(gpu, peer_gpu)->link_type == UVM_GPU_LINK_PCIE);
995 uvm_gpu_release_pcie_peer_access(gpu, peer_gpu);
996 }
997
998 uvm_processor_mask_cache_free(peers_to_release);
999
1000 uvm_gpu_release_locked(gpu);
1001
1002 uvm_mutex_unlock(&g_uvm_global.global_lock);
1003
1004 uvm_va_space_mm_or_current_release(va_space, mm);
1005
1006 return NV_OK;
1007 }
1008
1009 // This does *not* release the global GPU peer entry
disable_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1,struct list_head * deferred_free_list)1010 static void disable_peers(uvm_va_space_t *va_space,
1011 uvm_gpu_t *gpu0,
1012 uvm_gpu_t *gpu1,
1013 struct list_head *deferred_free_list)
1014 {
1015 NvU32 table_index;
1016 uvm_va_range_t *va_range;
1017
1018 uvm_assert_rwsem_locked_write(&va_space->lock);
1019
1020 table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1021
1022 if (!test_bit(table_index, va_space->enabled_peers))
1023 return;
1024
1025 // Unmap all page tables in this VA space which have peer mappings between
1026 // these two GPUs.
1027 uvm_for_each_va_range(va_range, va_space)
1028 uvm_va_range_disable_peer(va_range, gpu0, gpu1, deferred_free_list);
1029
1030 processor_mask_array_clear(va_space->can_access, gpu0->id, gpu1->id);
1031 processor_mask_array_clear(va_space->can_access, gpu1->id, gpu0->id);
1032 processor_mask_array_clear(va_space->accessible_from, gpu0->id, gpu1->id);
1033 processor_mask_array_clear(va_space->accessible_from, gpu1->id, gpu0->id);
1034 processor_mask_array_clear(va_space->can_copy_from, gpu0->id, gpu1->id);
1035 processor_mask_array_clear(va_space->can_copy_from, gpu1->id, gpu0->id);
1036 processor_mask_array_clear(va_space->has_nvlink, gpu0->id, gpu1->id);
1037 processor_mask_array_clear(va_space->has_nvlink, gpu1->id, gpu0->id);
1038 processor_mask_array_clear(va_space->indirect_peers, gpu0->id, gpu1->id);
1039 processor_mask_array_clear(va_space->indirect_peers, gpu1->id, gpu0->id);
1040 processor_mask_array_clear(va_space->has_native_atomics, gpu0->id, gpu1->id);
1041 processor_mask_array_clear(va_space->has_native_atomics, gpu1->id, gpu0->id);
1042
1043 __clear_bit(table_index, va_space->enabled_peers);
1044
1045 va_space_check_processors_masks(va_space);
1046 }
1047
enable_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1048 static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1049 {
1050 NV_STATUS status = NV_OK;
1051 uvm_gpu_va_space_t *gpu_va_space0, *gpu_va_space1;
1052 NvU32 table_index = 0;
1053 uvm_gpu_peer_t *peer_caps;
1054 uvm_va_range_t *va_range;
1055 LIST_HEAD(deferred_free_list);
1056
1057 uvm_assert_rwsem_locked_write(&va_space->lock);
1058
1059 // We know the GPUs were retained already, so now verify that they've been
1060 // registered by this specific VA space.
1061 if (!uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id) ||
1062 !uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id)) {
1063 return NV_ERR_INVALID_DEVICE;
1064 }
1065
1066 table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1067 peer_caps = &g_uvm_global.peers[table_index];
1068
1069 UVM_ASSERT(!test_bit(table_index, va_space->enabled_peers));
1070
1071 // If both GPUs have registered GPU VA spaces already, their big page sizes
1072 // must match.
1073 gpu_va_space0 = uvm_gpu_va_space_get(va_space, gpu0);
1074 gpu_va_space1 = uvm_gpu_va_space_get(va_space, gpu1);
1075 if (gpu_va_space0 &&
1076 gpu_va_space1 &&
1077 gpu_va_space0->page_tables.big_page_size != gpu_va_space1->page_tables.big_page_size) {
1078 return NV_ERR_NOT_COMPATIBLE;
1079 }
1080
1081 processor_mask_array_set(va_space->can_access, gpu0->id, gpu1->id);
1082 processor_mask_array_set(va_space->can_access, gpu1->id, gpu0->id);
1083 processor_mask_array_set(va_space->accessible_from, gpu0->id, gpu1->id);
1084 processor_mask_array_set(va_space->accessible_from, gpu1->id, gpu0->id);
1085
1086 if (gpu0->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_UNSUPPORTED) {
1087 UVM_ASSERT_MSG(gpu1->parent->peer_copy_mode == gpu0->parent->peer_copy_mode,
1088 "GPU %s GPU %s\n",
1089 uvm_gpu_name(gpu0),
1090 uvm_gpu_name(gpu1));
1091
1092 processor_mask_array_set(va_space->can_copy_from, gpu1->id, gpu0->id);
1093 processor_mask_array_set(va_space->can_copy_from, gpu0->id, gpu1->id);
1094 }
1095
1096 // Pre-compute nvlink and native atomic masks for the new peers
1097 if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1) {
1098 processor_mask_array_set(va_space->has_nvlink, gpu0->id, gpu1->id);
1099 processor_mask_array_set(va_space->has_nvlink, gpu1->id, gpu0->id);
1100
1101 processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
1102 processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
1103
1104 if (peer_caps->is_indirect_peer) {
1105 UVM_ASSERT(peer_caps->link_type >= UVM_GPU_LINK_NVLINK_2);
1106 UVM_ASSERT(gpu0->mem_info.numa.enabled);
1107 UVM_ASSERT(gpu1->mem_info.numa.enabled);
1108
1109 processor_mask_array_set(va_space->indirect_peers, gpu0->id, gpu1->id);
1110 processor_mask_array_set(va_space->indirect_peers, gpu1->id, gpu0->id);
1111 }
1112 }
1113 else if (gpu0->parent == gpu1->parent) {
1114 processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
1115 processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
1116 }
1117
1118 UVM_ASSERT(va_space_check_processors_masks(va_space));
1119 __set_bit(table_index, va_space->enabled_peers);
1120
1121 uvm_for_each_va_range(va_range, va_space) {
1122 status = uvm_va_range_enable_peer(va_range, gpu0, gpu1);
1123 if (status != NV_OK)
1124 break;
1125 }
1126
1127 if (status != NV_OK) {
1128 disable_peers(va_space, gpu0, gpu1, &deferred_free_list);
1129
1130 // uvm_va_range_disable_peer adds only external allocations to the list,
1131 // but uvm_va_range_enable_peer doesn't do anything for them.
1132 UVM_ASSERT(list_empty(&deferred_free_list));
1133 }
1134
1135 return status;
1136 }
1137
1138 // On success the GPUs and the P2P access have been retained, but the caller
1139 // must not assume that the GPUs are still registered in the VA space after the
1140 // call since the VA space lock is dropped.
retain_pcie_peers_from_uuids(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid_1,const NvProcessorUuid * gpu_uuid_2,uvm_gpu_t ** gpu0,uvm_gpu_t ** gpu1)1141 static NV_STATUS retain_pcie_peers_from_uuids(uvm_va_space_t *va_space,
1142 const NvProcessorUuid *gpu_uuid_1,
1143 const NvProcessorUuid *gpu_uuid_2,
1144 uvm_gpu_t **gpu0,
1145 uvm_gpu_t **gpu1)
1146 {
1147 NV_STATUS status = NV_OK;
1148
1149 uvm_va_space_down_read_rm(va_space);
1150
1151 // The UUIDs should have already been registered
1152 *gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid_1);
1153 *gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid_2);
1154
1155 if (*gpu0 && *gpu1 && !uvm_parent_id_equal((*gpu0)->parent->id, (*gpu1)->parent->id))
1156 status = uvm_gpu_retain_pcie_peer_access(*gpu0, *gpu1);
1157 else
1158 status = NV_ERR_INVALID_DEVICE;
1159
1160 uvm_va_space_up_read_rm(va_space);
1161
1162 return status;
1163 }
1164
uvm_va_space_pcie_peer_enabled(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1165 static bool uvm_va_space_pcie_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1166 {
1167 return !processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id) &&
1168 gpu0->parent != gpu1->parent &&
1169 uvm_va_space_peer_enabled(va_space, gpu0, gpu1);
1170 }
1171
uvm_va_space_nvlink_peer_enabled(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1172 static bool uvm_va_space_nvlink_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1173 {
1174 return processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id);
1175 }
1176
free_gpu_va_space(nv_kref_t * nv_kref)1177 static void free_gpu_va_space(nv_kref_t *nv_kref)
1178 {
1179 uvm_gpu_va_space_t *gpu_va_space = container_of(nv_kref, uvm_gpu_va_space_t, kref);
1180 uvm_gpu_va_space_state_t state = uvm_gpu_va_space_state(gpu_va_space);
1181 UVM_ASSERT(state == UVM_GPU_VA_SPACE_STATE_INIT || state == UVM_GPU_VA_SPACE_STATE_DEAD);
1182 uvm_kvfree(gpu_va_space);
1183 }
1184
uvm_gpu_va_space_release(uvm_gpu_va_space_t * gpu_va_space)1185 void uvm_gpu_va_space_release(uvm_gpu_va_space_t *gpu_va_space)
1186 {
1187 if (gpu_va_space)
1188 nv_kref_put(&gpu_va_space->kref, free_gpu_va_space);
1189 }
1190
uvm_gpu_va_space_acquire_mmap_lock(struct mm_struct * mm)1191 static void uvm_gpu_va_space_acquire_mmap_lock(struct mm_struct *mm)
1192 {
1193 if (mm) {
1194 // uvm_ats_register_gpu_va_space() requires mmap_lock to be held in
1195 // write mode if IBM ATS support is provided through the kernel.
1196 // mmap_lock is optional if IBM ATS support is provided through the
1197 // driver. In all cases, We need mmap_lock at least in read mode to
1198 // handle potential CPU mapping changes in
1199 // uvm_va_range_add_gpu_va_space().
1200 if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
1201 uvm_down_write_mmap_lock(mm);
1202 else
1203 uvm_down_read_mmap_lock(mm);
1204 }
1205 }
1206
uvm_gpu_va_space_release_mmap_lock(struct mm_struct * mm)1207 static void uvm_gpu_va_space_release_mmap_lock(struct mm_struct *mm)
1208 {
1209 if (mm) {
1210 if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
1211 uvm_up_write_mmap_lock(mm);
1212 else
1213 uvm_up_read_mmap_lock(mm);
1214 }
1215 }
1216
uvm_gpu_va_space_set_page_dir(uvm_gpu_va_space_t * gpu_va_space)1217 static NV_STATUS uvm_gpu_va_space_set_page_dir(uvm_gpu_va_space_t *gpu_va_space)
1218 {
1219 NV_STATUS status;
1220 uvm_gpu_phys_address_t pdb_phys;
1221 NvU64 num_pdes;
1222 NvU32 pasid = -1U;
1223
1224 if (gpu_va_space->ats.enabled) {
1225 pasid = gpu_va_space->ats.pasid;
1226 UVM_ASSERT(pasid != -1U);
1227 }
1228
1229 // Replace the existing PDB, if present, with the new one allocated by UVM.
1230 // This will fail if nvUvmInterfaceSetPageDirectory has already been called
1231 // on the RM VA space object, which prevents the user from registering twice
1232 // and corrupting our state.
1233 //
1234 // TODO: Bug 1733664: RM needs to preempt and disable channels during this
1235 // operation.
1236 pdb_phys = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr;
1237 num_pdes = uvm_mmu_page_tree_entries(&gpu_va_space->page_tables, 0, UVM_PAGE_SIZE_AGNOSTIC);
1238 status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu_va_space->duped_gpu_va_space,
1239 pdb_phys.address,
1240 num_pdes,
1241 pdb_phys.aperture == UVM_APERTURE_VID,
1242 pasid));
1243 if (status != NV_OK) {
1244 if (status == NV_ERR_NOT_SUPPORTED) {
1245 // Convert to the return code specified by uvm.h for
1246 // already-registered PDBs.
1247 status = NV_ERR_INVALID_DEVICE;
1248 }
1249 else {
1250 UVM_DBG_PRINT("nvUvmInterfaceSetPageDirectory() failed: %s, GPU %s\n",
1251 nvstatusToString(status),
1252 uvm_gpu_name(gpu_va_space->gpu));
1253 }
1254
1255 return status;
1256 }
1257
1258 gpu_va_space->did_set_page_directory = true;
1259 return status;
1260 }
1261
uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t * gpu_va_space)1262 void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space)
1263 {
1264 if (uvm_gpu_va_space_state(gpu_va_space) != UVM_GPU_VA_SPACE_STATE_INIT)
1265 uvm_assert_rwsem_locked_read(&gpu_va_space->va_space->lock);
1266
1267 if (gpu_va_space->did_set_page_directory) {
1268 NV_STATUS status = uvm_rm_locked_call(nvUvmInterfaceUnsetPageDirectory(gpu_va_space->duped_gpu_va_space));
1269 UVM_ASSERT_MSG(status == NV_OK,
1270 "nvUvmInterfaceUnsetPageDirectory() failed: %s, GPU %s\n",
1271 nvstatusToString(status),
1272 uvm_gpu_name(gpu_va_space->gpu));
1273 gpu_va_space->did_set_page_directory = false;
1274 }
1275 }
1276
destroy_gpu_va_space(uvm_gpu_va_space_t * gpu_va_space)1277 static void destroy_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
1278 {
1279 NvU64 delay_us = 0;
1280 uvm_va_space_t *va_space;
1281 uvm_gpu_va_space_state_t state;
1282
1283 if (!gpu_va_space)
1284 return;
1285
1286 state = uvm_gpu_va_space_state(gpu_va_space);
1287 UVM_ASSERT(state == UVM_GPU_VA_SPACE_STATE_INIT || state == UVM_GPU_VA_SPACE_STATE_DEAD);
1288
1289 va_space = gpu_va_space->va_space;
1290 UVM_ASSERT(va_space);
1291
1292 delay_us = atomic64_read(&va_space->test.destroy_gpu_va_space_delay_us);
1293
1294 if (delay_us)
1295 udelay(delay_us);
1296
1297 // Serialize this uvm_gpu_va_space_unset_page_dir call with the one in
1298 // uvm_va_space_mm_shutdown, which also starts with the VA space lock in
1299 // write mode. RM will serialize the calls internally, so we lock here only
1300 // to avoid getting benign errors from nvUvmInterfaceUnsetPageDirectory.
1301 //
1302 // If we never got to add_gpu_va_space, then gpu_va_space was never
1303 // registered within the va_space, so uvm_va_space_mm_shutdown couldn't see
1304 // it and we don't have to take the lock. state is guaranteed to be
1305 // UVM_GPU_VA_SPACE_STATE_INIT if add_gpu_va_space wasn't reached.
1306 if (state != UVM_GPU_VA_SPACE_STATE_INIT) {
1307 uvm_va_space_down_write(va_space);
1308 uvm_va_space_downgrade_write_rm(va_space);
1309 }
1310
1311 uvm_gpu_va_space_unset_page_dir(gpu_va_space);
1312
1313 if (state != UVM_GPU_VA_SPACE_STATE_INIT)
1314 uvm_va_space_up_read_rm(va_space);
1315
1316 if (gpu_va_space->page_tables.root)
1317 uvm_page_tree_deinit(&gpu_va_space->page_tables);
1318
1319 if (gpu_va_space->duped_gpu_va_space)
1320 uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu_va_space->duped_gpu_va_space));
1321
1322 // If the state is DEAD, then this GPU VA space is tracked in
1323 // va_space->gpu_va_space_deferred_free. uvm_ats_unregister_gpu_va_space may
1324 // wait for this count to go to 0 via uvm_va_space_mm_shutdown, so we must
1325 // decrement it before calling that function.
1326 if (gpu_va_space->state == UVM_GPU_VA_SPACE_STATE_DEAD) {
1327 int num_pending = atomic_dec_return(&va_space->gpu_va_space_deferred_free.num_pending);
1328 if (num_pending == 0)
1329 wake_up_all(&va_space->gpu_va_space_deferred_free.wait_queue);
1330 else
1331 UVM_ASSERT(num_pending > 0);
1332 }
1333
1334 // Note that this call may wait for faults to finish being serviced, which
1335 // means it may depend on the VA space lock and mmap_lock.
1336 uvm_ats_unregister_gpu_va_space(gpu_va_space);
1337
1338 uvm_ats_unbind_gpu(gpu_va_space);
1339
1340
1341 uvm_gpu_va_space_release(gpu_va_space);
1342 }

static NV_STATUS create_gpu_va_space(uvm_gpu_t *gpu,
                                     uvm_va_space_t *va_space,
                                     uvm_rm_user_object_t *user_rm_va_space,
                                     uvm_gpu_va_space_t **out_gpu_va_space)
{
    NV_STATUS status;
    uvm_gpu_va_space_t *gpu_va_space;
    UvmGpuAddressSpaceInfo gpu_address_space_info;

    *out_gpu_va_space = NULL;

    gpu_va_space = uvm_kvmalloc_zero(sizeof(*gpu_va_space));
    if (!gpu_va_space)
        return NV_ERR_NO_MEMORY;

    gpu_va_space->gpu = gpu;
    gpu_va_space->va_space = va_space;
    INIT_LIST_HEAD(&gpu_va_space->registered_channels);
    INIT_LIST_HEAD(&gpu_va_space->channel_va_ranges);
    nv_kref_init(&gpu_va_space->kref);

    // TODO: Bug 1624521: This interface needs to use rm_control_fd to do
    // validation.
    (void)user_rm_va_space->rm_control_fd;
    status = uvm_rm_locked_call(nvUvmInterfaceDupAddressSpace(uvm_gpu_device_handle(gpu),
                                                              user_rm_va_space->user_client,
                                                              user_rm_va_space->user_object,
                                                              &gpu_va_space->duped_gpu_va_space,
                                                              &gpu_address_space_info));
    if (status != NV_OK) {
        UVM_DBG_PRINT("failed to dup address space with error: %s, for GPU: %s\n",
                      nvstatusToString(status),
                      uvm_gpu_name(gpu));
        goto error;
    }

    gpu_va_space->ats.enabled = gpu_address_space_info.atsEnabled;

    // If ATS support in the UVM driver isn't enabled, fail registration of GPU
    // VA spaces which have ATS enabled.
    if (!g_uvm_global.ats.enabled && gpu_va_space->ats.enabled) {
        UVM_INFO_PRINT("GPU VA space requires ATS, but ATS is not supported or enabled\n");
        status = NV_ERR_INVALID_FLAGS;
        goto error;
    }

    // If this GPU VA space uses ATS, then pageable memory access must not have
    // been disabled in the VA space.
    if (gpu_va_space->ats.enabled && !uvm_va_space_pageable_mem_access_supported(va_space)) {
        UVM_INFO_PRINT("GPU VA space requires ATS, but pageable memory access is not supported\n");
        status = NV_ERR_INVALID_FLAGS;
        goto error;
    }

    // RM allows the creation of VA spaces on Pascal with 128k big pages. We
    // don't support that, so just fail those attempts.
    //
    // TODO: Bug 1789555: Remove this check once RM disallows this case.
    if (!gpu->parent->arch_hal->mmu_mode_hal(gpu_address_space_info.bigPageSize)) {
        status = NV_ERR_INVALID_FLAGS;
        goto error;
    }

    // Set up this GPU's page tables
    UVM_ASSERT(gpu_va_space->page_tables.root == NULL);
    status = uvm_page_tree_init(gpu,
                                gpu_va_space,
                                UVM_PAGE_TREE_TYPE_USER,
                                gpu_address_space_info.bigPageSize,
                                uvm_get_page_tree_location(gpu->parent),
                                &gpu_va_space->page_tables);
    if (status != NV_OK) {
        UVM_ERR_PRINT("Initializing the page tree failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
        goto error;
    }

    status = uvm_ats_bind_gpu(gpu_va_space);
    if (status != NV_OK)
        goto error;

    *out_gpu_va_space = gpu_va_space;
    return NV_OK;

error:
    destroy_gpu_va_space(gpu_va_space);
    return status;
}
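
// Illustrative sketch (not part of this driver, compiled out):
// create_gpu_va_space() uses a single error label that relies on the destroy
// path tolerating a partially initialized object: the object is
// zero-allocated up front and every teardown step checks whether its resource
// exists. A minimal model of that idiom, with hypothetical acquire_*/release_*
// helpers:
#if 0
typedef struct
{
    void *resource_a;
    void *resource_b;
} object_t;

// Safe to call no matter how far creation got, including not at all.
static void object_destroy(object_t *object)
{
    if (!object)
        return;

    if (object->resource_b)
        release_b(object->resource_b);
    if (object->resource_a)
        release_a(object->resource_a);
    kfree(object);
}

static NV_STATUS object_create(object_t **out_object)
{
    NV_STATUS status;
    object_t *object = kzalloc(sizeof(*object), GFP_KERNEL);

    *out_object = NULL;
    if (!object)
        return NV_ERR_NO_MEMORY;

    status = acquire_a(&object->resource_a);
    if (status != NV_OK)
        goto error;

    status = acquire_b(&object->resource_b);
    if (status != NV_OK)
        goto error;

    *out_object = object;
    return NV_OK;

error:
    object_destroy(object);
    return status;
}
#endif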

static void add_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
{
    uvm_va_space_t *va_space = gpu_va_space->va_space;
    uvm_gpu_t *gpu = gpu_va_space->gpu;

    UVM_ASSERT(va_space);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id))
        va_space->num_non_faultable_gpu_va_spaces++;

    uvm_processor_mask_set(&va_space->registered_gpu_va_spaces, gpu->id);
    va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)] = gpu_va_space;
    gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_ACTIVE;
}

static NV_STATUS check_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
{
    uvm_va_space_t *va_space = gpu_va_space->va_space;
    uvm_gpu_t *gpu = gpu_va_space->gpu;
    uvm_gpu_t *other_gpu;
    uvm_gpu_va_space_t *other_gpu_va_space;

    UVM_ASSERT(va_space);
    uvm_assert_rwsem_locked_write(&va_space->lock);

    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_INIT);

    if (!uvm_processor_mask_test(&va_space->registered_gpus, gpu->id))
        return NV_ERR_INVALID_DEVICE;

    // RM will return an error from create_gpu_va_space if the given RM VA
    // space object has already been registered by any VA space. Now we just
    // need to check if a different VA space has already been registered.
    if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
        return NV_ERR_INVALID_DEVICE;

    // If a GPU unregister is in progress but temporarily dropped the VA space
    // lock, we can't register new GPU VA spaces.
    if (uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id))
        return NV_ERR_INVALID_DEVICE;

    // The VA space's mm is being torn down, so don't allow more work
    if (va_space->disallow_new_registers)
        return NV_ERR_PAGE_TABLE_NOT_AVAIL;

    // This GPU VA space must match its big page size with all enabled peers.
    // Also, the new GPU VA space must have the same ATS setting as the
    // previously-registered GPU VA spaces.
    for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->registered_gpu_va_spaces) {
        UVM_ASSERT(other_gpu != gpu);

        other_gpu_va_space = uvm_gpu_va_space_get(va_space, other_gpu);
        if (other_gpu_va_space->ats.enabled != gpu_va_space->ats.enabled)
            return NV_ERR_INVALID_FLAGS;

        if (!test_bit(uvm_gpu_peer_table_index(gpu->id, other_gpu->id), va_space->enabled_peers))
            continue;

        if (gpu_va_space->page_tables.big_page_size != other_gpu_va_space->page_tables.big_page_size)
            return NV_ERR_NOT_COMPATIBLE;
    }

    return NV_OK;
}
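
// Illustrative sketch (not part of this driver, compiled out): enabled peer
// pairs are tracked in a flat bitmap indexed by an unordered-pair function of
// the two GPU IDs. The driver's actual uvm_gpu_peer_table_index() is defined
// elsewhere and may differ; below is one conventional triangular-number
// encoding of an unordered pair, shown only to make the test_bit()/set_bit()
// usage against the peer bitmap concrete.
#if 0
// Map the unordered pair {a, b}, with 0 <= a, b < n and a != b, to a unique
// index in [0, n * (n - 1) / 2).
static size_t unordered_pair_index(size_t a, size_t b)
{
    size_t hi = max(a, b);
    size_t lo = min(a, b);

    return hi * (hi - 1) / 2 + lo;
}

// Usage against a bitmap sized for n processors:
//     DECLARE_BITMAP(enabled_peers, N * (N - 1) / 2);
//     set_bit(unordered_pair_index(id0, id1), enabled_peers);
//     if (test_bit(unordered_pair_index(id0, id1), enabled_peers))
//         ...
#endif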

NV_STATUS uvm_va_space_register_gpu_va_space(uvm_va_space_t *va_space,
                                             uvm_rm_user_object_t *user_rm_va_space,
                                             const NvProcessorUuid *gpu_uuid)
{
    NV_STATUS status;
    uvm_gpu_t *gpu;
    uvm_gpu_va_space_t *gpu_va_space;
    uvm_va_range_t *va_range;
    struct mm_struct *mm;
    LIST_HEAD(deferred_free_list);

    gpu = uvm_va_space_retain_gpu_by_uuid(va_space, gpu_uuid);
    if (!gpu)
        return NV_ERR_INVALID_DEVICE;

    mm = uvm_va_space_mm_or_current_retain(va_space);
    if (!mm) {
        status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
        goto error_gpu_release;
    }

    status = create_gpu_va_space(gpu, va_space, user_rm_va_space, &gpu_va_space);
    if (status != NV_OK)
        goto error_gpu_release;

    uvm_gpu_va_space_acquire_mmap_lock(mm);
    uvm_va_space_down_write(va_space);

    status = check_gpu_va_space(gpu_va_space);
    if (status != NV_OK)
        goto error_unlock;

    status = uvm_ats_register_gpu_va_space(gpu_va_space);
    if (status != NV_OK)
        goto error_unlock;

    uvm_va_space_up_write(va_space);
    uvm_gpu_va_space_release_mmap_lock(mm);

    status = uvm_gpu_va_space_set_page_dir(gpu_va_space);
    if (status != NV_OK)
        goto error_destroy;

    uvm_gpu_va_space_acquire_mmap_lock(mm);
    uvm_va_space_down_write(va_space);

    // The VA space state might have changed while the lock was dropped before
    // the reacquire for write, so check the state again.
    status = check_gpu_va_space(gpu_va_space);
    if (status != NV_OK)
        goto error_unlock;

    add_gpu_va_space(gpu_va_space);

    // Tell the VA ranges that they can map this GPU, if they need to.
    //
    // Ideally we'd downgrade the VA space lock to read mode while adding new
    // mappings, but that would complicate error handling since we have to
    // remove the GPU VA space if any of these mappings fail.
    uvm_for_each_va_range(va_range, va_space) {
        status = uvm_va_range_add_gpu_va_space(va_range, gpu_va_space, mm);
        if (status != NV_OK)
            goto error;
    }

    uvm_va_space_up_write(va_space);
    uvm_gpu_va_space_release_mmap_lock(mm);

    uvm_va_space_mm_or_current_release(va_space, mm);
    uvm_gpu_release(gpu);

    return NV_OK;

error:
    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
    remove_gpu_va_space(gpu_va_space, mm, &deferred_free_list);

    // Nothing else could've been attached to this gpu_va_space (channels,
    // external allocations) since we've been holding the VA space lock ever
    // since add_gpu_va_space(). Therefore the GPU VA space itself should be
    // the only item in the list, and we can just destroy it directly below.
    UVM_ASSERT(list_is_singular(&deferred_free_list));
error_unlock:
    uvm_va_space_up_write(va_space);
    uvm_gpu_va_space_release_mmap_lock(mm);
error_destroy:
    destroy_gpu_va_space(gpu_va_space);
error_gpu_release:
    uvm_va_space_mm_or_current_release(va_space, mm);
    uvm_gpu_release(gpu);
    return status;
}
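
// Illustrative sketch (not part of this driver, compiled out): registration
// above must perform blocking work (page directory setup) that cannot run
// under the VA space lock, so it drops the lock, does the work, retakes the
// lock, and revalidates the state it checked earlier, since another thread
// may have changed it in the window. A generic model of the drop-and-
// revalidate idiom, with hypothetical validate/commit/blocking_* helpers and
// a hypothetical struct state holding an rw_semaphore:
#if 0
static int register_with_blocking_step(struct state *state)
{
    int err;

    down_write(&state->lock);
    err = validate(state);
    up_write(&state->lock);
    if (err)
        return err;

    // Blocking work; forbidden while holding state->lock.
    err = blocking_setup(state);
    if (err)
        return err;

    down_write(&state->lock);

    // The lock was dropped, so the earlier checks may no longer hold:
    // revalidate before committing.
    err = validate(state);
    if (!err)
        commit(state);

    up_write(&state->lock);

    if (err)
        blocking_teardown(state);
    return err;
}
#endif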

static NvU32 find_gpu_va_space_index(uvm_va_space_t *va_space,
                                     uvm_parent_gpu_t *parent_gpu)
{
    uvm_gpu_id_t gpu_id;
    NvU32 index = UVM_ID_MAX_PROCESSORS;

    // TODO: Bug 4351121: this conversion from parent ID to gpu ID depends on
    // the fact that only one partition is registered per va_space per physical
    // GPU. This code will need to change when multiple MIG instances are
    // supported.
    for_each_sub_processor_id_in_parent_gpu(gpu_id, parent_gpu->id) {
        if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu_id)) {
            UVM_ASSERT(index == UVM_ID_MAX_PROCESSORS);
            index = uvm_id_gpu_index(gpu_id);
        }
    }

    return index;
}

uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
                                                       uvm_parent_gpu_t *parent_gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;
    NvU32 gpu_index;

    uvm_assert_rwsem_locked(&va_space->lock);

    if (!parent_gpu)
        return NULL;

    gpu_index = find_gpu_va_space_index(va_space, parent_gpu);
    if (gpu_index == UVM_ID_MAX_PROCESSORS)
        return NULL;

    gpu_va_space = va_space->gpu_va_spaces[gpu_index];
    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
    UVM_ASSERT(gpu_va_space->va_space == va_space);
    UVM_ASSERT(gpu_va_space->gpu->parent == parent_gpu);

    return gpu_va_space;
}

// The caller must have stopped all channels under this gpu_va_space before
// calling this function.
static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
                                struct mm_struct *mm,
                                struct list_head *deferred_free_list)
{
    uvm_va_space_t *va_space;
    uvm_va_range_t *va_range;
    uvm_va_range_t *va_range_next;
    uvm_gpu_t *gpu;

    if (!gpu_va_space || uvm_gpu_va_space_state(gpu_va_space) != UVM_GPU_VA_SPACE_STATE_ACTIVE)
        return;

    va_space = gpu_va_space->va_space;
    UVM_ASSERT(va_space);

    uvm_assert_rwsem_locked_write(&va_space->lock);

    uvm_gpu_va_space_detach_all_user_channels(gpu_va_space, deferred_free_list);

    // Removing all registered channels should've removed all VA ranges used by
    // those channels.
    UVM_ASSERT(list_empty(&gpu_va_space->channel_va_ranges));

    // Unmap all page tables in this VA space on this GPU.
    // TODO: Bug 1799173: This will need to add objects to deferred_free_list
    uvm_for_each_va_range_safe(va_range, va_range_next, va_space)
        uvm_va_range_remove_gpu_va_space(va_range, gpu_va_space, mm, deferred_free_list);

    uvm_hmm_remove_gpu_va_space(va_space, gpu_va_space, mm);

    uvm_deferred_free_object_add(deferred_free_list,
                                 &gpu_va_space->deferred_free,
                                 UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE);

    // Let uvm_va_space_mm_shutdown know that it has to wait for this GPU VA
    // space to be destroyed.
    atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);

    gpu = gpu_va_space->gpu;

    if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id)) {
        UVM_ASSERT(va_space->num_non_faultable_gpu_va_spaces);
        va_space->num_non_faultable_gpu_va_spaces--;
    }

    uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu->id);
    va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)] = NULL;
    gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
}
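
// Illustrative sketch (not part of this driver, compiled out): teardown above
// accumulates doomed objects on a caller-provided list while the VA space
// lock is held, and the caller destroys them only after dropping its locks,
// because the destructors may block or take locks of their own. A minimal
// model of the collect-then-free idiom, with hypothetical struct state and
// object types:
#if 0
static void remove_all(struct state *state, struct list_head *doomed)
{
    struct object *object, *next;

    lockdep_assert_held(&state->lock);

    // Under the lock: unlink only; no blocking work here.
    list_for_each_entry_safe(object, next, &state->objects, node)
        list_move_tail(&object->node, doomed);
}

static void teardown(struct state *state)
{
    struct object *object, *next;
    LIST_HEAD(doomed);

    mutex_lock(&state->lock);
    remove_all(state, &doomed);
    mutex_unlock(&state->lock);

    // Outside the lock: destructors are free to block.
    list_for_each_entry_safe(object, next, &doomed, node) {
        list_del(&object->node);
        object_destroy(object);
    }
}
#endif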

NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
{
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu;
    uvm_gpu_va_space_t *gpu_va_space;
    struct mm_struct *mm;
    LIST_HEAD(deferred_free_list);

    // Stopping channels requires holding the VA space lock in read mode, so do
    // it first. This also takes the serialize_writers_lock, so we'll serialize
    // with other threads about to perform channel binds in
    // uvm_register_channel.
    uvm_va_space_down_read_rm(va_space);

    gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, gpu_uuid);
    if (!gpu) {
        uvm_va_space_up_read_rm(va_space);
        return NV_ERR_INVALID_DEVICE;
    }

    gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
    UVM_ASSERT(gpu_va_space);

    gpu_va_space_stop_all_channels(gpu_va_space);

    // We need to drop the lock to re-take it in write mode
    uvm_gpu_va_space_retain(gpu_va_space);
    uvm_gpu_retain(gpu);
    uvm_va_space_up_read_rm(va_space);

    mm = uvm_va_space_mm_or_current_retain_lock(va_space);
    uvm_va_space_down_write(va_space);

    // We dropped the lock so we have to re-verify that this gpu_va_space is
    // still valid. If so, then the GPU is also still registered under the VA
    // space. If not, we raced with another unregister thread, so return an
    // error for double-unregister.
    if (uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_DEAD) {
        status = NV_ERR_INVALID_DEVICE;
    }
    else {
        UVM_ASSERT(gpu == uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, gpu_uuid));
        UVM_ASSERT(gpu_va_space == uvm_gpu_va_space_get(va_space, gpu));

        remove_gpu_va_space(gpu_va_space, mm, &deferred_free_list);
    }

    uvm_va_space_up_write(va_space);

    // Unlock the mm since the call to uvm_deferred_free_object_list() requires
    // that we don't hold any locks. We don't release the mm yet because that
    // could call uvm_va_space_mm_shutdown(), which waits for the deferred free
    // list to be empty, which would cause a deadlock.
    if (mm)
        uvm_up_read_mmap_lock(mm);

    uvm_deferred_free_object_list(&deferred_free_list);
    uvm_gpu_va_space_release(gpu_va_space);
    uvm_gpu_release(gpu);

    uvm_va_space_mm_or_current_release(va_space, mm);

    return status;
}

bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
{
    size_t table_index;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id));
    UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id));

    table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
    return !!test_bit(table_index, va_space->enabled_peers);
}

uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
                                                      const uvm_processor_mask_t *candidates,
                                                      uvm_processor_id_t src)
{
    uvm_processor_mask_t *mask = &va_space->closest_processors.mask;
    uvm_processor_id_t closest_id;

    // Highest priority: the local processor itself
    if (uvm_processor_mask_test(candidates, src))
        return src;

    uvm_mutex_lock(&va_space->closest_processors.mask_mutex);

    if (uvm_processor_mask_and(mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) {
        // NVLink peers
        uvm_processor_mask_t *indirect_peers;
        uvm_processor_mask_t *direct_peers = &va_space->closest_processors.direct_peers;

        indirect_peers = &va_space->indirect_peers[uvm_id_value(src)];

        if (uvm_processor_mask_andnot(direct_peers, mask, indirect_peers)) {
            // Direct peers, prioritizing GPU peers over the CPU
            closest_id = uvm_processor_mask_find_first_gpu_id(direct_peers);
            if (UVM_ID_IS_INVALID(closest_id))
                closest_id = UVM_ID_CPU;
        }
        else {
            // Indirect peers
            UVM_ASSERT(UVM_ID_IS_GPU(src));
            UVM_ASSERT(!uvm_processor_mask_test(mask, UVM_ID_CPU));

            closest_id = uvm_processor_mask_find_first_gpu_id(mask);
        }
    }
    else if (uvm_processor_mask_and(mask, candidates, &va_space->can_access[uvm_id_value(src)])) {
        // If the source is a GPU, prioritize PCIe peers over the CPU. CPUs
        // only have direct access to GPU memory over NVLink, not PCIe, and
        // would have been selected above.
        UVM_ASSERT(UVM_ID_IS_GPU(src));

        closest_id = uvm_processor_mask_find_first_gpu_id(mask);
        if (UVM_ID_IS_INVALID(closest_id))
            closest_id = UVM_ID_CPU;
    }
    else {
        // No GPUs with direct access are in the mask. Just pick the first
        // processor in the mask, if any.
        closest_id = uvm_processor_mask_find_first_id(candidates);
    }

    uvm_mutex_unlock(&va_space->closest_processors.mask_mutex);

    return closest_id;
}
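
// Illustrative sketch (not part of this driver, compiled out): the selection
// above is a priority cascade over bitmasks: intersect the candidates with
// progressively weaker "affinity" sets and take the first hit, falling back
// to any candidate at the end. A compressed model using kernel bitmaps; the
// tier names and parameters are hypothetical.
#if 0
// Returns the chosen processor index, or nbits if candidates is empty.
static unsigned long pick_closest(const unsigned long *candidates,
                                  const unsigned long *nvlink_peers,
                                  const unsigned long *direct_access,
                                  unsigned long *scratch,
                                  unsigned int nbits)
{
    // Tier 1: NVLink peers among the candidates.
    if (bitmap_and(scratch, candidates, nvlink_peers, nbits))
        return find_first_bit(scratch, nbits);

    // Tier 2: processors with any direct access.
    if (bitmap_and(scratch, candidates, direct_access, nbits))
        return find_first_bit(scratch, nbits);

    // Fallback: any candidate at all.
    return find_first_bit(candidates, nbits);
}
#endif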

static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object,
                                             uvm_parent_processor_mask_t *flushed_parent_gpus)
{
    uvm_user_channel_t *channel = container_of(object, uvm_user_channel_t, deferred_free);
    uvm_gpu_t *gpu = channel->gpu;

    // Flush out any faults with this instance pointer still in the buffer.
    // This prevents us from re-allocating the same instance pointer for a new
    // channel and mis-attributing old faults to it.
    if (gpu->parent->replayable_faults_supported &&
        !uvm_parent_processor_mask_test(flushed_parent_gpus, gpu->parent->id)) {
        uvm_gpu_fault_buffer_flush(gpu);
        uvm_parent_processor_mask_set(flushed_parent_gpus, gpu->parent->id);
    }

    uvm_user_channel_destroy_detached(channel);
}

void uvm_deferred_free_object_list(struct list_head *deferred_free_list)
{
    uvm_deferred_free_object_t *object, *next;
    uvm_parent_processor_mask_t flushed_parent_gpus;

    // flushed_parent_gpus prevents redundant fault buffer flushes by tracking
    // the parent GPUs on which the flush already happened. Flushing the fault
    // buffer on one GPU instance will flush it for all other instances on that
    // parent GPU.
    uvm_parent_processor_mask_zero(&flushed_parent_gpus);

    list_for_each_entry_safe(object, next, deferred_free_list, list_node) {
        list_del(&object->list_node);

        switch (object->type) {
            case UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL:
                uvm_deferred_free_object_channel(object, &flushed_parent_gpus);
                break;
            case UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE:
                destroy_gpu_va_space(container_of(object, uvm_gpu_va_space_t, deferred_free));
                break;
            case UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION:
                uvm_ext_gpu_map_free(container_of(object, uvm_ext_gpu_map_t, deferred_free));
                break;
            default:
                UVM_ASSERT_MSG(0, "Invalid type %d\n", object->type);
        }
    }
}
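
// Illustrative sketch (not part of this driver, compiled out): the dispatch
// above embeds a small {type, list_node} header in each freeable object and
// recovers the full object with container_of() based on the type tag. A
// minimal model of the tagged-header idiom, with hypothetical foo/bar types
// and destroy helpers:
#if 0
enum free_type { FREE_TYPE_FOO, FREE_TYPE_BAR };

struct deferred_free
{
    enum free_type type;
    struct list_head list_node;
};

struct foo { int payload;  struct deferred_free deferred_free; };
struct bar { char *name;   struct deferred_free deferred_free; };

static void process_one(struct deferred_free *object)
{
    switch (object->type) {
        case FREE_TYPE_FOO:
            // container_of() recovers the enclosing struct from the embedded
            // header, so one list can hold heterogeneous object types.
            foo_destroy(container_of(object, struct foo, deferred_free));
            break;
        case FREE_TYPE_BAR:
            bar_destroy(container_of(object, struct bar, deferred_free));
            break;
        default:
            WARN_ON_ONCE(1);
    }
}
#endif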

uvm_user_channel_t *uvm_gpu_va_space_get_user_channel(uvm_gpu_va_space_t *gpu_va_space,
                                                      uvm_gpu_phys_address_t instance_ptr)
{
    uvm_user_channel_t *user_channel;
    uvm_va_space_t *va_space = gpu_va_space->va_space;

    UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
    uvm_assert_rwsem_locked(&va_space->lock);

    // TODO: Bug 1880191: This is called on every non-replayable fault service.
    // Evaluate the performance impact of this list traversal and potentially
    // replace it with something better.
    list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node) {
        if (user_channel->instance_ptr.addr.address == instance_ptr.address &&
            user_channel->instance_ptr.addr.aperture == instance_ptr.aperture) {
            return user_channel;
        }
    }

    return NULL;
}

NV_STATUS uvm_api_enable_peer_access(UVM_ENABLE_PEER_ACCESS_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu0 = NULL;
    uvm_gpu_t *gpu1 = NULL;
    size_t table_index;

    uvm_mutex_lock(&g_uvm_global.global_lock);
    status = retain_pcie_peers_from_uuids(va_space, &params->gpuUuidA, &params->gpuUuidB, &gpu0, &gpu1);
    uvm_mutex_unlock(&g_uvm_global.global_lock);
    if (status != NV_OK)
        return status;

    uvm_va_space_down_write(va_space);

    table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
    if (test_bit(table_index, va_space->enabled_peers))
        status = NV_ERR_INVALID_DEVICE;
    else
        status = enable_peers(va_space, gpu0, gpu1);

    uvm_va_space_up_write(va_space);

    if (status != NV_OK) {
        uvm_mutex_lock(&g_uvm_global.global_lock);
        uvm_gpu_release_pcie_peer_access(gpu0, gpu1);
        uvm_mutex_unlock(&g_uvm_global.global_lock);
    }

    return status;
}

NV_STATUS uvm_api_disable_peer_access(UVM_DISABLE_PEER_ACCESS_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu0, *gpu1;
    LIST_HEAD(deferred_free_list);

    uvm_va_space_down_write(va_space);

    gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
    gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);

    if (!gpu0 || !gpu1) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    if (uvm_id_equal(gpu0->id, gpu1->id)) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    if (!uvm_va_space_pcie_peer_enabled(va_space, gpu0, gpu1)) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    disable_peers(va_space, gpu0, gpu1, &deferred_free_list);

    // disable_peers doesn't release the GPU peer ref count, which means the
    // two GPUs will remain retained even if another thread unregisters them
    // from this VA space after we drop the lock.
    uvm_va_space_up_write(va_space);

    uvm_deferred_free_object_list(&deferred_free_list);

    uvm_mutex_lock(&g_uvm_global.global_lock);
    uvm_gpu_release_pcie_peer_access(gpu0, gpu1);
    uvm_mutex_unlock(&g_uvm_global.global_lock);

    return NV_OK;

error:
    uvm_va_space_up_write(va_space);
    return status;
}

bool uvm_va_space_pageable_mem_access_supported(uvm_va_space_t *va_space)
{
    // Any pageable memory access requires that we have mm_struct association
    // via va_space_mm.
    if (!uvm_va_space_mm_enabled(va_space))
        return false;

    // We might have systems with both ATS and HMM support. ATS gets priority.
    if (g_uvm_global.ats.supported)
        return g_uvm_global.ats.enabled;

    return uvm_hmm_is_enabled(va_space);
}

NV_STATUS uvm_test_get_pageable_mem_access_type(UVM_TEST_GET_PAGEABLE_MEM_ACCESS_TYPE_PARAMS *params,
                                                struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_NONE;

    if (uvm_va_space_pageable_mem_access_supported(va_space)) {
        if (g_uvm_global.ats.enabled) {
            if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
                params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_ATS_KERNEL;
            else
                params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_ATS_DRIVER;
        }
        else {
            params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_HMM;
        }
    }
    else if (uvm_va_space_mm_enabled(va_space)) {
        params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_MMU_NOTIFIER;
    }

    return NV_OK;
}

NV_STATUS uvm_test_flush_deferred_work(UVM_TEST_FLUSH_DEFERRED_WORK_PARAMS *params, struct file *filp)
{
    UvmTestDeferredWorkType work_type = params->work_type;

    switch (work_type) {
        case UvmTestDeferredWorkTypeAcessedByMappings:
            nv_kthread_q_flush(&g_uvm_global.global_q);
            return NV_OK;
        default:
            return NV_ERR_INVALID_ARGUMENT;
    }
}

NV_STATUS uvm_test_enable_nvlink_peer_access(UVM_TEST_ENABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu0 = NULL;
    uvm_gpu_t *gpu1 = NULL;
    size_t table_index;
    uvm_gpu_peer_t *peer_caps = NULL;

    uvm_va_space_down_write(va_space);

    gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
    gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);

    if (gpu0 && gpu1 && !uvm_id_equal(gpu0->id, gpu1->id))
        peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);

    if (!peer_caps || peer_caps->link_type < UVM_GPU_LINK_NVLINK_1) {
        uvm_va_space_up_write(va_space);
        return NV_ERR_INVALID_DEVICE;
    }

    table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);

    // NVLink peers are automatically enabled in the VA space at VA space
    // registration time. In order to avoid tests having to keep track of the
    // different initial state for PCIe and NVLink peers, we just return NV_OK
    // if the NVLink peers were already enabled.
    if (test_bit(table_index, va_space->enabled_peers))
        status = NV_OK;
    else
        status = enable_peers(va_space, gpu0, gpu1);

    uvm_va_space_up_write(va_space);

    return status;
}

NV_STATUS uvm_test_disable_nvlink_peer_access(UVM_TEST_DISABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
    NV_STATUS status = NV_OK;
    uvm_gpu_t *gpu0, *gpu1;
    LIST_HEAD(deferred_free_list);

    uvm_va_space_down_write(va_space);

    gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
    gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);

    if (!gpu0 || !gpu1) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    if (uvm_id_equal(gpu0->id, gpu1->id)) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    if (!uvm_va_space_nvlink_peer_enabled(va_space, gpu0, gpu1)) {
        status = NV_ERR_INVALID_DEVICE;
        goto error;
    }

    disable_peers(va_space, gpu0, gpu1, &deferred_free_list);

    uvm_va_space_up_write(va_space);

    uvm_deferred_free_object_list(&deferred_free_list);

    return NV_OK;

error:
    uvm_va_space_up_write(va_space);
    return status;
}

NV_STATUS uvm_test_va_space_inject_error(UVM_TEST_VA_SPACE_INJECT_ERROR_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    atomic_set(&va_space->test.migrate_vma_allocation_fail_nth, params->migrate_vma_allocation_fail_nth);
    atomic_set(&va_space->test.va_block_allocation_fail_nth, params->va_block_allocation_fail_nth);

    return NV_OK;
}

// Add a fixed number of dummy thread contexts to each thread context table.
// The newly added thread contexts are removed by calling
// uvm_test_va_space_remove_dummy_thread_contexts, or during VA space shutdown.
NV_STATUS uvm_test_va_space_add_dummy_thread_contexts(UVM_TEST_VA_SPACE_ADD_DUMMY_THREAD_CONTEXTS_PARAMS *params,
                                                      struct file *filp)
{
    size_t i;
    uvm_va_space_t *va_space;
    size_t total_dummy_thread_contexts = params->num_dummy_thread_contexts * UVM_THREAD_CONTEXT_TABLE_SIZE;
    NV_STATUS status = NV_OK;

    if (params->num_dummy_thread_contexts == 0)
        return NV_OK;

    va_space = uvm_va_space_get(filp);

    uvm_va_space_down_write(va_space);

    if (va_space->test.dummy_thread_context_wrappers != NULL) {
        status = NV_ERR_INVALID_STATE;
        goto out;
    }

    if (va_space->test.num_dummy_thread_context_wrappers > 0) {
        status = NV_ERR_INVALID_STATE;
        goto out;
    }

    if (!uvm_thread_context_wrapper_is_used()) {
        status = NV_ERR_INVALID_STATE;
        goto out;
    }

    va_space->test.dummy_thread_context_wrappers = uvm_kvmalloc(sizeof(*va_space->test.dummy_thread_context_wrappers) *
                                                                total_dummy_thread_contexts);
    if (va_space->test.dummy_thread_context_wrappers == NULL) {
        status = NV_ERR_NO_MEMORY;
        goto out;
    }

    va_space->test.num_dummy_thread_context_wrappers = total_dummy_thread_contexts;

    for (i = 0; i < total_dummy_thread_contexts; i++) {
        uvm_thread_context_t *thread_context = &va_space->test.dummy_thread_context_wrappers[i].context;

        // The context pointer is used to fill the task.
        thread_context->task = (struct task_struct *) thread_context;

        uvm_thread_context_add_at(thread_context, i % UVM_THREAD_CONTEXT_TABLE_SIZE);
    }

out:
    uvm_va_space_up_write(va_space);

    return status;
}

static void va_space_remove_dummy_thread_contexts(uvm_va_space_t *va_space)
{
    size_t i;

    uvm_assert_rwsem_locked_write(&va_space->lock);

    if (va_space->test.dummy_thread_context_wrappers == NULL) {
        UVM_ASSERT(va_space->test.num_dummy_thread_context_wrappers == 0);
        return;
    }

    UVM_ASSERT(uvm_thread_context_wrapper_is_used());
    UVM_ASSERT(uvm_enable_builtin_tests != 0);
    UVM_ASSERT(va_space->test.num_dummy_thread_context_wrappers > 0);

    for (i = 0; i < va_space->test.num_dummy_thread_context_wrappers; i++) {
        uvm_thread_context_t *thread_context = &va_space->test.dummy_thread_context_wrappers[i].context;

        uvm_thread_context_remove_at(thread_context, i % UVM_THREAD_CONTEXT_TABLE_SIZE);
    }

    uvm_kvfree(va_space->test.dummy_thread_context_wrappers);
    va_space->test.dummy_thread_context_wrappers = NULL;
    va_space->test.num_dummy_thread_context_wrappers = 0;
}

NV_STATUS uvm_test_va_space_remove_dummy_thread_contexts(UVM_TEST_VA_SPACE_REMOVE_DUMMY_THREAD_CONTEXTS_PARAMS *params,
                                                         struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    uvm_va_space_down_write(va_space);

    va_space_remove_dummy_thread_contexts(va_space);

    uvm_va_space_up_write(va_space);

    return NV_OK;
}

NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    // va_space lock is not needed here.
    atomic64_set(&va_space->test.destroy_gpu_va_space_delay_us, params->delay_us);

    return NV_OK;
}

NV_STATUS uvm_test_force_cpu_to_cpu_copy_with_ce(UVM_TEST_FORCE_CPU_TO_CPU_COPY_WITH_CE_PARAMS *params,
                                                 struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    va_space->test.force_cpu_to_cpu_copy_with_ce = params->force_copy_with_ce;
    return NV_OK;
}

NV_STATUS uvm_test_va_space_allow_movable_allocations(UVM_TEST_VA_SPACE_ALLOW_MOVABLE_ALLOCATIONS_PARAMS *params,
                                                      struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);

    va_space->test.allow_allocation_from_movable = params->allow_movable;
    return NV_OK;
}

// List of fault service contexts for CPU faults
static LIST_HEAD(g_cpu_service_block_context_list);

static uvm_spinlock_t g_cpu_service_block_context_list_lock;

uvm_service_block_context_t *uvm_service_block_context_alloc(struct mm_struct *mm)
{
    uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context));

    if (!service_context)
        return NULL;

    service_context->block_context = uvm_va_block_context_alloc(mm);
    if (!service_context->block_context) {
        uvm_kvfree(service_context);
        service_context = NULL;
    }

    return service_context;
}

void uvm_service_block_context_free(uvm_service_block_context_t *service_context)
{
    if (!service_context)
        return;

    uvm_va_block_context_free(service_context->block_context);
    uvm_kvfree(service_context);
}

NV_STATUS uvm_service_block_context_init(void)
{
    unsigned num_preallocated_contexts = 4;

    uvm_spin_lock_init(&g_cpu_service_block_context_list_lock, UVM_LOCK_ORDER_LEAF);

    // Pre-allocate some fault service contexts for the CPU and add them to the
    // global list.
    while (num_preallocated_contexts-- > 0) {
        uvm_service_block_context_t *service_context = uvm_service_block_context_alloc(NULL);

        if (!service_context)
            return NV_ERR_NO_MEMORY;

        list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
    }

    return NV_OK;
}

void uvm_service_block_context_exit(void)
{
    uvm_service_block_context_t *service_context, *service_context_tmp;

    // Free the fault service contexts for the CPU and clear the global list.
    list_for_each_entry_safe(service_context,
                             service_context_tmp,
                             &g_cpu_service_block_context_list,
                             cpu_fault.service_context_list) {
        uvm_service_block_context_free(service_context);
    }

    INIT_LIST_HEAD(&g_cpu_service_block_context_list);
}

// Get a fault service context from the global list or allocate a new one if
// there are no available entries.
static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
{
    uvm_service_block_context_t *service_context;

    uvm_spin_lock(&g_cpu_service_block_context_list_lock);

    service_context = list_first_entry_or_null(&g_cpu_service_block_context_list,
                                               uvm_service_block_context_t,
                                               cpu_fault.service_context_list);

    if (service_context)
        list_del(&service_context->cpu_fault.service_context_list);

    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);

    if (!service_context)
        service_context = uvm_service_block_context_alloc(NULL);
    else
        uvm_va_block_context_init(service_context->block_context, NULL);

    return service_context;
}

// Put a fault service context in the global list.
static void service_block_context_cpu_free(uvm_service_block_context_t *service_context)
{
    uvm_spin_lock(&g_cpu_service_block_context_list_lock);

    list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);

    uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
}
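
// Illustrative sketch (not part of this driver, compiled out): the pair of
// helpers above is a freelist cache. Pop a pre-allocated context under a
// spinlock, fall back to a fresh allocation outside the spinlock when the
// list is empty, and push contexts back on free so later faults reuse them.
// A minimal generic model with a hypothetical context type:
#if 0
static LIST_HEAD(g_free_contexts);
static DEFINE_SPINLOCK(g_free_contexts_lock);

struct context { struct list_head node; /* ... */ };

static struct context *context_get(void)
{
    struct context *ctx;

    spin_lock(&g_free_contexts_lock);
    ctx = list_first_entry_or_null(&g_free_contexts, struct context, node);
    if (ctx)
        list_del(&ctx->node);
    spin_unlock(&g_free_contexts_lock);

    // Slow path: the cache was empty, so allocate outside the spinlock.
    if (!ctx)
        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

    return ctx;
}

static void context_put(struct context *ctx)
{
    spin_lock(&g_free_contexts_lock);
    list_add(&ctx->node, &g_free_contexts);
    spin_unlock(&g_free_contexts_lock);
}
#endif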

static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
                                         struct vm_area_struct *vma,
                                         struct vm_fault *vmf,
                                         bool is_hmm)
{
    uvm_va_block_t *va_block;
    NvU64 fault_addr = nv_page_fault_va(vmf);
    bool is_write = vmf->flags & FAULT_FLAG_WRITE;
    NV_STATUS status = uvm_global_get_status();
    bool tools_enabled;
    bool major_fault = false;
    bool is_remote_mm = false;
    uvm_service_block_context_t *service_context;
    uvm_processor_mask_t *gpus_to_check_for_ecc;

    if (status != NV_OK)
        goto convert_error;

    // TODO: Bug 2583279: Lock tracking is disabled for the power management
    // lock in order to suppress reporting of a lock policy violation. The
    // violation consists in acquiring the power management lock multiple
    // times, and it is manifested as an error during release. The
    // re-acquisition of the power management lock happens upon re-entry into
    // the UVM module, and it is benign on its own, but when combined with
    // certain power management scenarios, it is indicative of a potential
    // deadlock. Tracking will be re-enabled once the power management locking
    // strategy is modified to avoid deadlocks.
    if (!uvm_down_read_trylock_no_tracking(&g_uvm_global.pm.lock)) {
        status = NV_ERR_BUSY_RETRY;
        goto convert_error;
    }

    service_context = service_block_context_cpu_alloc();
    if (!service_context) {
        status = NV_ERR_NO_MEMORY;
        goto unlock;
    }

    service_context->cpu_fault.wakeup_time_stamp = 0;

    // There are up to three mm_structs to worry about, and they might all be
    // different:
    //
    // 1) vma->vm_mm
    // 2) current->mm
    // 3) va_space->va_space_mm.mm (though note that if this is valid, then it
    //    must match vma->vm_mm).
    //
    // The kernel guarantees that vma->vm_mm has a reference taken with
    // mmap_lock held on the CPU fault path, so tell the fault handler to use
    // that one. current->mm might differ if we're on the access_process_vm
    // (ptrace) path or if another driver is calling get_user_pages.
    service_context->block_context->mm = vma->vm_mm;

    // The mmap_lock might be held in write mode, but the mode doesn't matter
    // for the purpose of lock ordering, and we don't rely on it being held in
    // write mode anywhere, so just record it as read mode in all cases.
    uvm_record_lock_mmap_lock_read(vma->vm_mm);

    do {
        bool do_sleep = false;

        if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
            NvU64 now = NV_GETTIME();
            if (now < service_context->cpu_fault.wakeup_time_stamp)
                do_sleep = true;

            if (do_sleep)
                uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);

            // Drop the VA space lock while we sleep
            uvm_va_space_up_read(va_space);

            // usleep_range is preferred because msleep has a 20ms granularity
            // and udelay uses a busy-wait loop. usleep_range uses
            // high-resolution timers and, by adding a range, the Linux
            // scheduler may coalesce our wakeup with others, thus saving some
            // interrupts.
            if (do_sleep) {
                unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;

                usleep_range(nap_us, nap_us + nap_us / 2);
            }
        }

        uvm_va_space_down_read(va_space);

        if (do_sleep)
            uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);

        if (is_hmm) {
            if (va_space->va_space_mm.mm == vma->vm_mm) {
                // Note that normally we should find a va_block for the
                // faulting address because the block had to be created when
                // migrating a page to the GPU, and a device private PTE
                // inserted into the CPU page tables, in order for
                // migrate_to_ram() to be called. Not finding it means the PTE
                // was remapped to a different virtual address with mremap(),
                // so create a new va_block if needed.
                status = uvm_hmm_va_block_find_create(va_space,
                                                      fault_addr,
                                                      &service_context->block_context->hmm.vma,
                                                      &va_block);
                if (status != NV_OK)
                    break;

                UVM_ASSERT(service_context->block_context->hmm.vma == vma);
                status = uvm_hmm_migrate_begin(va_block);
                if (status != NV_OK)
                    break;

                service_context->cpu_fault.vmf = vmf;
            }
            else {
                is_remote_mm = true;
                status = uvm_hmm_remote_cpu_fault(vmf);
                break;
            }
        }
        else {
            status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
            if (status != NV_OK) {
                UVM_ASSERT_MSG(status == NV_ERR_NO_MEMORY, "status: %s\n", nvstatusToString(status));
                break;
            }

            // Watch out, current->mm might not be vma->vm_mm
            UVM_ASSERT(vma == uvm_va_range_vma(va_block->va_range));
        }

        // Loop until thrashing goes away.
        status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);

        if (is_hmm)
            uvm_hmm_migrate_finish(va_block);
    } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);

    if (status != NV_OK && !(is_hmm && status == NV_ERR_BUSY_RETRY)) {
        UvmEventFatalReason reason;

        reason = uvm_tools_status_to_fatal_fault_reason(status);
        UVM_ASSERT(reason != UvmEventFatalReasonInvalid);

        uvm_tools_record_cpu_fatal_fault(va_space, fault_addr, is_write, reason);
    }

    tools_enabled = va_space->tools.enabled;
    gpus_to_check_for_ecc = &service_context->cpu_fault.gpus_to_check_for_ecc;

    if (status == NV_OK && !is_remote_mm)
        uvm_global_gpu_retain(gpus_to_check_for_ecc);

    uvm_va_space_up_read(va_space);
    uvm_record_unlock_mmap_lock_read(vma->vm_mm);

    if (status == NV_OK && !is_remote_mm) {
        status = uvm_global_gpu_check_ecc_error(gpus_to_check_for_ecc);
        uvm_global_gpu_release(gpus_to_check_for_ecc);
    }

    if (tools_enabled)
        uvm_tools_flush_events();

    // Major faults involve I/O in order to resolve the fault. If any pages
    // were DMA'ed between the GPU and host memory, that makes it a major
    // fault. A process can also get statistics for major and minor faults by
    // calling readproc().
    major_fault = service_context->cpu_fault.did_migrate;
    service_block_context_cpu_free(service_context);

unlock:
    // TODO: Bug 2583279: See the comment above the matching lock acquisition
    uvm_up_read_no_tracking(&g_uvm_global.pm.lock);

convert_error:
    switch (status) {
        case NV_OK:
        case NV_ERR_BUSY_RETRY:
            return VM_FAULT_NOPAGE | (major_fault ? VM_FAULT_MAJOR : 0);
        case NV_ERR_NO_MEMORY:
            return VM_FAULT_OOM;
        default:
            return VM_FAULT_SIGBUS;
    }
}

vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
                                          struct vm_area_struct *vma,
                                          struct vm_fault *vmf)
{
    UVM_ASSERT(va_space == uvm_va_space_get(vma->vm_file));

    return uvm_va_space_cpu_fault(va_space, vma, vmf, false);
}

vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
                                      struct vm_area_struct *vma,
                                      struct vm_fault *vmf)
{
    return uvm_va_space_cpu_fault(va_space, vma, vmf, true);
}