1 /*******************************************************************************
2     Copyright (c) 2015-2024 NVIDIA Corporation
3 
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
10 
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
13 
14     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20     DEALINGS IN THE SOFTWARE.
21 
22 *******************************************************************************/
23 
24 #include "uvm_api.h"
25 #include "uvm_va_space.h"
26 #include "uvm_va_range.h"
27 #include "uvm_lock.h"
28 #include "uvm_global.h"
29 #include "uvm_kvmalloc.h"
30 #include "uvm_perf_heuristics.h"
31 #include "uvm_user_channel.h"
32 #include "uvm_tools.h"
33 #include "uvm_thread_context.h"
34 #include "uvm_hal.h"
35 #include "uvm_map_external.h"
36 #include "uvm_ats.h"
37 #include "uvm_gpu_access_counters.h"
38 #include "uvm_hmm.h"
39 #include "uvm_va_space_mm.h"
40 #include "uvm_test.h"
41 #include "uvm_common.h"
42 #include "nv_uvm_interface.h"
43 #include "nv-kthread-q.h"
44 
processor_mask_array_test(const uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)45 static bool processor_mask_array_test(const uvm_processor_mask_t *mask,
46                                       uvm_processor_id_t mask_id,
47                                       uvm_processor_id_t id)
48 {
49     return uvm_processor_mask_test(&mask[uvm_id_value(mask_id)], id);
50 }
51 
processor_mask_array_clear(uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)52 static void processor_mask_array_clear(uvm_processor_mask_t *mask,
53                                        uvm_processor_id_t mask_id,
54                                        uvm_processor_id_t id)
55 {
56     uvm_processor_mask_clear(&mask[uvm_id_value(mask_id)], id);
57 }
58 
processor_mask_array_set(uvm_processor_mask_t * mask,uvm_processor_id_t mask_id,uvm_processor_id_t id)59 static void processor_mask_array_set(uvm_processor_mask_t *mask,
60                                      uvm_processor_id_t mask_id,
61                                      uvm_processor_id_t id)
62 {
63     uvm_processor_mask_set(&mask[uvm_id_value(mask_id)], id);
64 }
65 
processor_mask_array_empty(const uvm_processor_mask_t * mask,uvm_processor_id_t mask_id)66 static bool processor_mask_array_empty(const uvm_processor_mask_t *mask, uvm_processor_id_t mask_id)
67 {
68     return uvm_processor_mask_empty(&mask[uvm_id_value(mask_id)]);
69 }
70 
71 static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
72 static void disable_peers(uvm_va_space_t *va_space,
73                           uvm_gpu_t *gpu0,
74                           uvm_gpu_t *gpu1,
75                           struct list_head *deferred_free_list);
76 static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
77                                 struct mm_struct *mm,
78                                 struct list_head *deferred_free_list);
79 static void va_space_remove_dummy_thread_contexts(uvm_va_space_t *va_space);
80 
init_tools_data(uvm_va_space_t * va_space)81 static void init_tools_data(uvm_va_space_t *va_space)
82 {
83     int i;
84 
85     uvm_init_rwsem(&va_space->tools.lock, UVM_LOCK_ORDER_VA_SPACE_TOOLS);
86 
87     for (i = 0; i < ARRAY_SIZE(va_space->tools.counters); i++)
88         INIT_LIST_HEAD(va_space->tools.counters + i);
89     for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v1); i++)
90         INIT_LIST_HEAD(va_space->tools.queues_v1 + i);
91     for (i = 0; i < ARRAY_SIZE(va_space->tools.queues_v2); i++)
92         INIT_LIST_HEAD(va_space->tools.queues_v2 + i);
93 }
94 
register_gpu_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu)95 static NV_STATUS register_gpu_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
96 {
97     uvm_gpu_t *other_gpu;
98 
99     uvm_assert_rwsem_locked(&va_space->lock);
100 
101     for_each_va_space_gpu(other_gpu, va_space) {
102         uvm_gpu_peer_t *peer_caps;
103 
104         if (uvm_id_equal(other_gpu->id, gpu->id))
105             continue;
106 
107         peer_caps = uvm_gpu_peer_caps(gpu, other_gpu);
108 
109         if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1 || gpu->parent == other_gpu->parent) {
110             NV_STATUS status = enable_peers(va_space, gpu, other_gpu);
111             if (status != NV_OK)
112                 return status;
113         }
114     }
115 
116     return NV_OK;
117 }
118 
va_space_check_processors_masks(uvm_va_space_t * va_space)119 static bool va_space_check_processors_masks(uvm_va_space_t *va_space)
120 {
121     uvm_processor_id_t processor;
122     uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
123     uvm_processor_mask_t *processors = &block_context->scratch_processor_mask;
124 
125     uvm_assert_rwsem_locked_write(&va_space->lock);
126 
127     uvm_processor_mask_copy(processors, &va_space->registered_gpus);
128     uvm_processor_mask_set(processors, UVM_ID_CPU);
129 
130     for_each_id_in_mask(processor, processors) {
131         uvm_processor_id_t other_processor;
132         bool check_can_copy_from = true;
133 
134         if (UVM_ID_IS_GPU(processor)) {
135             uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, processor);
136 
137             // Peer copies between two processors can be disabled even when they
138             // are NvLink peers, or there is HW support for atomics between
139             // them.
140             if (gpu->parent->peer_copy_mode == UVM_GPU_PEER_COPY_MODE_UNSUPPORTED)
141                 check_can_copy_from = false;
142         }
143 
144         UVM_ASSERT(processor_mask_array_test(va_space->can_access, processor, processor));
145         UVM_ASSERT(processor_mask_array_test(va_space->accessible_from, processor, processor));
146         UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, processor, processor));
147         UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, processor, UVM_ID_CPU));
148         UVM_ASSERT(processor_mask_array_test(va_space->can_copy_from, UVM_ID_CPU, processor));
149 
150         // NVLINK
151         UVM_ASSERT(!processor_mask_array_test(va_space->has_nvlink, processor, processor));
152 
153         if (check_can_copy_from) {
154             UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_nvlink[uvm_id_value(processor)],
155                                                  &va_space->can_copy_from[uvm_id_value(processor)]));
156         }
157 
158         // Peers
159         UVM_ASSERT(!processor_mask_array_test(va_space->indirect_peers, processor, processor));
160         UVM_ASSERT(uvm_processor_mask_subset(&va_space->indirect_peers[uvm_id_value(processor)],
161                                              &va_space->has_native_atomics[uvm_id_value(processor)]));
162 
163         // Atomics
164         UVM_ASSERT(processor_mask_array_test(va_space->has_native_atomics, processor, processor));
165 
166         if (check_can_copy_from) {
167             UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)],
168                                                  &va_space->can_copy_from[uvm_id_value(processor)]));
169         }
170 
171         UVM_ASSERT(uvm_processor_mask_subset(&va_space->has_native_atomics[uvm_id_value(processor)],
172                                              &va_space->can_access[uvm_id_value(processor)]));
173 
174         for_each_id_in_mask(other_processor, &va_space->can_access[uvm_id_value(processor)])
175             UVM_ASSERT(processor_mask_array_test(va_space->accessible_from, other_processor, processor));
176 
177         for_each_id_in_mask(other_processor, &va_space->accessible_from[uvm_id_value(processor)])
178             UVM_ASSERT(processor_mask_array_test(va_space->can_access, other_processor, processor));
179     }
180 
181     return true;
182 }
183 
uvm_va_space_create(struct address_space * mapping,uvm_va_space_t ** va_space_ptr,NvU64 flags)184 NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va_space_ptr, NvU64 flags)
185 {
186     NV_STATUS status;
187     uvm_va_space_t *va_space = uvm_kvmalloc_zero(sizeof(*va_space));
188     uvm_gpu_id_t gpu_id;
189 
190     *va_space_ptr = NULL;
191     if (!va_space)
192         return NV_ERR_NO_MEMORY;
193 
194     if (flags & ~UVM_INIT_FLAGS_MASK) {
195         uvm_kvfree(va_space);
196         return NV_ERR_INVALID_ARGUMENT;
197     }
198 
199     uvm_init_rwsem(&va_space->lock, UVM_LOCK_ORDER_VA_SPACE);
200     uvm_mutex_init(&va_space->closest_processors.mask_mutex, UVM_LOCK_ORDER_LEAF);
201     uvm_mutex_init(&va_space->serialize_writers_lock, UVM_LOCK_ORDER_VA_SPACE_SERIALIZE_WRITERS);
202     uvm_mutex_init(&va_space->read_acquire_write_release_lock,
203                    UVM_LOCK_ORDER_VA_SPACE_READ_ACQUIRE_WRITE_RELEASE_LOCK);
204     uvm_spin_lock_init(&va_space->va_space_mm.lock, UVM_LOCK_ORDER_LEAF);
205     uvm_range_tree_init(&va_space->va_range_tree);
206     uvm_ats_init_va_space(va_space);
207 
208     // Init to 0 since we rely on atomic_inc_return behavior to return 1 as the first ID
209     atomic64_set(&va_space->range_group_id_counter, 0);
210 
211     INIT_RADIX_TREE(&va_space->range_groups, NV_UVM_GFP_FLAGS);
212     uvm_range_tree_init(&va_space->range_group_ranges);
213 
214     bitmap_zero(va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);
215 
216     // CPU is not explicitly registered in the va space
217     processor_mask_array_set(va_space->can_access, UVM_ID_CPU, UVM_ID_CPU);
218     processor_mask_array_set(va_space->accessible_from, UVM_ID_CPU, UVM_ID_CPU);
219     processor_mask_array_set(va_space->can_copy_from, UVM_ID_CPU, UVM_ID_CPU);
220     processor_mask_array_set(va_space->has_native_atomics, UVM_ID_CPU, UVM_ID_CPU);
221 
222     // CPU always participates in system-wide atomics
223     uvm_processor_mask_set(&va_space->system_wide_atomics_enabled_processors, UVM_ID_CPU);
224     uvm_processor_mask_set(&va_space->faultable_processors, UVM_ID_CPU);
225 
226     // Initialize the CPU/GPU affinity array. New CPU NUMA nodes are added at
227     // GPU registration time, but they are never freed on unregister_gpu
228     // (although the GPU is removed from the corresponding mask).
229     for_each_gpu_id(gpu_id) {
230         uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
231 
232         affinity->numa_node = -1;
233         uvm_processor_mask_zero(&affinity->gpus);
234     }
235 
236     init_waitqueue_head(&va_space->va_space_mm.last_retainer_wait_queue);
237     init_waitqueue_head(&va_space->gpu_va_space_deferred_free.wait_queue);
238 
239     va_space->mapping = mapping;
240     va_space->test.page_prefetch_enabled = true;
241 
242     init_tools_data(va_space);
243 
244     uvm_down_write_mmap_lock(current->mm);
245     uvm_va_space_down_write(va_space);
246 
247     va_space->va_block_context = uvm_va_block_context_alloc(NULL);
248     if (!va_space->va_block_context) {
249         status = NV_ERR_NO_MEMORY;
250         goto fail;
251     }
252 
253     status = uvm_perf_init_va_space_events(va_space, &va_space->perf_events);
254     if (status != NV_OK)
255         goto fail;
256 
257     status = uvm_perf_heuristics_load(va_space);
258     if (status != NV_OK)
259         goto fail;
260 
261     status = uvm_gpu_init_va_space(va_space);
262     if (status != NV_OK)
263         goto fail;
264 
265     UVM_ASSERT(va_space_check_processors_masks(va_space));
266 
267     va_space->initialization_flags = flags;
268 
269     status = uvm_va_space_mm_register(va_space);
270     if (status != NV_OK)
271         goto fail;
272 
273     uvm_hmm_va_space_initialize(va_space);
274 
275     uvm_va_space_up_write(va_space);
276     uvm_up_write_mmap_lock(current->mm);
277 
278     uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
279     list_add_tail(&va_space->list_node, &g_uvm_global.va_spaces.list);
280     uvm_mutex_unlock(&g_uvm_global.va_spaces.lock);
281 
282     *va_space_ptr = va_space;
283 
284     return NV_OK;
285 
286 fail:
287     uvm_perf_heuristics_unload(va_space);
288     uvm_perf_destroy_va_space_events(&va_space->perf_events);
289     uvm_va_block_context_free(va_space->va_block_context);
290     uvm_va_space_up_write(va_space);
291     uvm_up_write_mmap_lock(current->mm);
292 
293     // See the comment in uvm_va_space_mm_unregister() for why this has to be
294     // called after releasing the locks.
295     uvm_va_space_mm_unregister(va_space);
296 
297     uvm_kvfree(va_space);
298 
299     return status;
300 }
301 
302 // This function does *not* release the GPU, nor the GPU's PCIE peer pairings.
303 // Those are returned so the caller can do it after dropping the VA space lock.
unregister_gpu(uvm_va_space_t * va_space,uvm_gpu_t * gpu,struct mm_struct * mm,struct list_head * deferred_free_list,uvm_processor_mask_t * peers_to_release)304 static void unregister_gpu(uvm_va_space_t *va_space,
305                            uvm_gpu_t *gpu,
306                            struct mm_struct *mm,
307                            struct list_head *deferred_free_list,
308                            uvm_processor_mask_t *peers_to_release)
309 {
310     uvm_gpu_t *peer_gpu;
311     uvm_va_range_t *va_range;
312     NvU32 peer_table_index;
313 
314     uvm_assert_rwsem_locked_write(&va_space->lock);
315 
316     if (peers_to_release)
317         uvm_processor_mask_zero(peers_to_release);
318 
319     // If a GPU VA Space was explicitly registered, but not explicitly
320     // unregistered, unregister it and add all of its objects to the free list.
321     remove_gpu_va_space(uvm_gpu_va_space_get(va_space, gpu), mm, deferred_free_list);
322 
323     uvm_for_each_va_range(va_range, va_space)
324         uvm_va_range_unregister_gpu(va_range, gpu, mm, deferred_free_list);
325 
326     uvm_hmm_unregister_gpu(va_space, gpu, mm);
327 
328     // If this GPU has any peer-to-peer pair that was explicitly enabled, but
329     // not explicitly disabled, disable it.
330     // Notably do this only after unregistering the GPU from VA ranges to make
331     // sure there is no pending work using the peer mappings within the VA
332     // blocks (in particular migrations using the peer identity mappings).
333     for_each_va_space_gpu(peer_gpu, va_space) {
334         if (gpu == peer_gpu)
335             continue;
336 
337         peer_table_index = uvm_gpu_peer_table_index(gpu->id, peer_gpu->id);
338         if (test_bit(peer_table_index, va_space->enabled_peers)) {
339             disable_peers(va_space, gpu, peer_gpu, deferred_free_list);
340 
341             // Only PCIE peers need to be globally released. NVLINK peers are
342             // brought up and torn down automatically within add_gpu and
343             // remove_gpu.
344             if (peers_to_release && g_uvm_global.peers[peer_table_index].link_type == UVM_GPU_LINK_PCIE)
345                 uvm_processor_mask_set(peers_to_release, peer_gpu->id);
346         }
347     }
348 
349     if (gpu->parent->isr.replayable_faults.handling) {
350         UVM_ASSERT(uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
351         uvm_processor_mask_clear(&va_space->faultable_processors, gpu->id);
352         uvm_processor_mask_clear(&va_space->system_wide_atomics_enabled_processors, gpu->id);
353     }
354     else {
355         UVM_ASSERT(uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
356         uvm_processor_mask_clear(&va_space->non_faultable_processors, gpu->id);
357     }
358 
359     processor_mask_array_clear(va_space->can_access, gpu->id, gpu->id);
360     processor_mask_array_clear(va_space->can_access, gpu->id, UVM_ID_CPU);
361     processor_mask_array_clear(va_space->can_access, UVM_ID_CPU, gpu->id);
362     UVM_ASSERT(processor_mask_array_empty(va_space->can_access, gpu->id));
363 
364     processor_mask_array_clear(va_space->accessible_from, gpu->id, gpu->id);
365     processor_mask_array_clear(va_space->accessible_from, gpu->id, UVM_ID_CPU);
366     processor_mask_array_clear(va_space->accessible_from, UVM_ID_CPU, gpu->id);
367     UVM_ASSERT(processor_mask_array_empty(va_space->accessible_from, gpu->id));
368 
369     processor_mask_array_clear(va_space->can_copy_from, gpu->id, gpu->id);
370     processor_mask_array_clear(va_space->can_copy_from, gpu->id, UVM_ID_CPU);
371     processor_mask_array_clear(va_space->can_copy_from, UVM_ID_CPU, gpu->id);
372     UVM_ASSERT(processor_mask_array_empty(va_space->can_copy_from, gpu->id));
373 
374     processor_mask_array_clear(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
375     processor_mask_array_clear(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
376     UVM_ASSERT(processor_mask_array_empty(va_space->has_nvlink, gpu->id));
377 
378     UVM_ASSERT(processor_mask_array_empty(va_space->indirect_peers, gpu->id));
379 
380     processor_mask_array_clear(va_space->has_native_atomics, gpu->id, gpu->id);
381     processor_mask_array_clear(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
382     processor_mask_array_clear(va_space->has_native_atomics, UVM_ID_CPU, gpu->id);
383     UVM_ASSERT(processor_mask_array_empty(va_space->has_native_atomics, gpu->id));
384 
385     uvm_processor_mask_clear(&va_space->registered_gpus, gpu->id);
386     va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = NULL;
387 
388     // Remove the GPU from the CPU/GPU affinity masks
389     if (gpu->parent->closest_cpu_numa_node != -1) {
390         uvm_gpu_id_t gpu_id;
391 
392         for_each_gpu_id(gpu_id) {
393             uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
394 
395             if (affinity->numa_node == gpu->parent->closest_cpu_numa_node) {
396                 uvm_processor_mask_clear(&affinity->gpus, gpu->id);
397                 break;
398             }
399         }
400     }
401 
402     if (va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)]) {
403         uvm_conf_computing_dma_buffer_free(&gpu->conf_computing.dma_buffer_pool,
404                                            va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)],
405                                            &va_space->gpu_unregister_dma_buffer[uvm_id_gpu_index(gpu->id)]->tracker);
406     }
407     va_space_check_processors_masks(va_space);
408 }
409 
gpu_va_space_stop_all_channels(uvm_gpu_va_space_t * gpu_va_space)410 static void gpu_va_space_stop_all_channels(uvm_gpu_va_space_t *gpu_va_space)
411 {
412     uvm_user_channel_t *user_channel;
413 
414     list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node)
415         uvm_user_channel_stop(user_channel);
416 
417     // Prevent new channels from being registered since we'll be dropping the
418     // VA space lock shortly with the expectation that no more channels will
419     // arrive.
420     atomic_set(&gpu_va_space->disallow_new_channels, 1);
421 }
422 
423 // Detaches (unregisters) all user channels in a GPU VA space. The channels must
424 // have previously been stopped.
425 //
426 // The detached channels are added to the input list. The caller is expected to
427 // drop the VA space lock and call uvm_deferred_free_object_list to complete the
428 // destroy operation.
uvm_gpu_va_space_detach_all_user_channels(uvm_gpu_va_space_t * gpu_va_space,struct list_head * deferred_free_list)429 static void uvm_gpu_va_space_detach_all_user_channels(uvm_gpu_va_space_t *gpu_va_space,
430                                                       struct list_head *deferred_free_list)
431 {
432     uvm_user_channel_t *user_channel, *next_channel;
433     list_for_each_entry_safe(user_channel, next_channel, &gpu_va_space->registered_channels, list_node)
434         uvm_user_channel_detach(user_channel, deferred_free_list);
435 }
436 
uvm_va_space_detach_all_user_channels(uvm_va_space_t * va_space,struct list_head * deferred_free_list)437 void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list_head *deferred_free_list)
438 {
439     uvm_gpu_va_space_t *gpu_va_space;
440     for_each_gpu_va_space(gpu_va_space, va_space)
441         uvm_gpu_va_space_detach_all_user_channels(gpu_va_space, deferred_free_list);
442 }
443 
uvm_va_space_destroy(uvm_va_space_t * va_space)444 void uvm_va_space_destroy(uvm_va_space_t *va_space)
445 {
446     uvm_va_range_t *va_range, *va_range_next;
447     uvm_gpu_t *gpu;
448     uvm_gpu_id_t gpu_id;
449     uvm_processor_mask_t *retained_gpus = &va_space->registered_gpus_teardown;
450     LIST_HEAD(deferred_free_list);
451 
452     // Remove the VA space from the global list before we start tearing things
453     // down so other threads can't see the VA space in a partially-valid state.
454     uvm_mutex_lock(&g_uvm_global.va_spaces.lock);
455     list_del(&va_space->list_node);
456     uvm_mutex_unlock(&g_uvm_global.va_spaces.lock);
457 
458     uvm_perf_heuristics_stop(va_space);
459 
460     // Stop all channels before unmapping anything. This kills the channels and
461     // prevents spurious MMU faults from being generated (bug 1722021), but
462     // doesn't prevent the bottom half from servicing old faults for those
463     // channels.
464     //
465     // This involves making RM calls, so we have to do that with the VA space
466     // lock in read mode.
467     uvm_va_space_down_read_rm(va_space);
468     uvm_va_space_stop_all_user_channels(va_space);
469     uvm_va_space_up_read_rm(va_space);
470 
471     // The bottom half GPU page fault handler(s) could still look up and use
472     // this va_space via the GPU's instance_ptr_table. Lock them out while we
473     // tear down. Once we're done, the bottom half will fail to find any
474     // registered GPUs in the VA space, so those faults will be canceled.
475     uvm_va_space_down_write(va_space);
476 
477     uvm_processor_mask_copy(retained_gpus, &va_space->registered_gpus);
478 
479     bitmap_copy(va_space->enabled_peers_teardown, va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);
480 
481     uvm_va_space_detach_all_user_channels(va_space, &deferred_free_list);
482 
483     // Destroy all VA ranges. We do this before unregistering the GPUs for
484     // performance, since GPU unregister will walk all VA ranges in the VA space
485     // multiple times.
486     uvm_for_each_va_range_safe(va_range, va_range_next, va_space) {
487         // All channel ranges should've been destroyed by the channel unregister
488         // above
489         UVM_ASSERT(va_range->type != UVM_VA_RANGE_TYPE_CHANNEL);
490         uvm_va_range_destroy(va_range, &deferred_free_list);
491     }
492 
493     uvm_range_group_radix_tree_destroy(va_space);
494 
495     // Unregister all GPUs in the VA space. Note that this does not release the
496     // GPUs nor peers. We do that below.
497     for_each_va_space_gpu(gpu, va_space)
498         unregister_gpu(va_space, gpu, NULL, &deferred_free_list, NULL);
499 
500     uvm_hmm_va_space_destroy(va_space);
501 
502     uvm_perf_heuristics_unload(va_space);
503     uvm_perf_destroy_va_space_events(&va_space->perf_events);
504 
505     va_space_remove_dummy_thread_contexts(va_space);
506 
507     // Destroy the VA space's block context node tracking after all ranges have
508     // been destroyed as the VA blocks may reference it.
509     uvm_va_block_context_free(va_space->va_block_context);
510 
511     uvm_va_space_up_write(va_space);
512 
513     UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpus));
514     UVM_ASSERT(uvm_processor_mask_empty(&va_space->registered_gpu_va_spaces));
515 
516     for_each_gpu_id(gpu_id)
517         UVM_ASSERT(va_space->registered_gpus_table[uvm_id_gpu_index(gpu_id)] == NULL);
518 
519     // The instance pointer mappings for this VA space have been removed so no
520     // new bottom halves can get to this VA space, but there could still be
521     // bottom halves running from before we removed the mapping. Rather than
522     // ref-count the VA space, just wait for them to finish.
523     //
524     // This is also required to synchronize any pending
525     // block_deferred_accessed_by() work items.
526 
527     nv_kthread_q_flush(&g_uvm_global.global_q);
528 
529     for_each_gpu_in_mask(gpu, retained_gpus) {
530         // Free the processor masks allocated in uvm_va_space_register_gpu().
531         // The mask is also freed in uvm_va_space_unregister_gpu() but that
532         // function won't be called in uvm_release() and uvm_release_deferred()
533         // path.
534         uvm_processor_mask_cache_free(va_space->peers_to_release[uvm_id_value(gpu->id)]);
535 
536         // Set the pointer to NULL to avoid accidental re-use and double free.
537         va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
538 
539         if (!gpu->parent->isr.replayable_faults.handling) {
540             UVM_ASSERT(!gpu->parent->isr.non_replayable_faults.handling);
541             continue;
542         }
543 
544         nv_kthread_q_flush(&gpu->parent->isr.bottom_half_q);
545 
546         // The same applies to the kill channel kthreads. However, they need to
547         // be flushed after their bottom-half counterparts since the latter may
548         // schedule a channel kill.
549         if (gpu->parent->isr.non_replayable_faults.handling)
550             nv_kthread_q_flush(&gpu->parent->isr.kill_channel_q);
551 
552         if (gpu->parent->access_counters_supported)
553             uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
554 
555     }
556 
557     // Check that all CPU/GPU affinity masks are empty
558     for_each_gpu_id(gpu_id) {
559         const uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
560 
561         UVM_ASSERT(uvm_processor_mask_empty(&affinity->gpus));
562     }
563 
564     // ensure that there are no pending events that refer to this va_space
565     uvm_tools_flush_events();
566 
567     // Perform cleanup we can't do while holding the VA space lock
568 
569     uvm_deferred_free_object_list(&deferred_free_list);
570 
571     // Normally we'd expect this to happen as part of uvm_mm_release()
572     // but if userspace never initialized uvm_mm_fd that won't happen.
573     // We don't have to take the va_space_mm spinlock and update state
574     // here because we know no other thread can be in or subsequently
575     // call uvm_api_mm_initialize successfully because the UVM
576     // file-descriptor has been released.
577     if (va_space->va_space_mm.state == UVM_VA_SPACE_MM_STATE_UNINITIALIZED)
578         uvm_va_space_mm_unregister(va_space);
579     UVM_ASSERT(!uvm_va_space_mm_alive(&va_space->va_space_mm));
580 
581     uvm_mutex_lock(&g_uvm_global.global_lock);
582 
583     // Release the GPUs and their peer counts. Do not use
584     // for_each_gpu_in_mask for the outer loop as it reads the GPU
585     // state, which might get destroyed.
586     for_each_gpu_id_in_mask(gpu_id, retained_gpus) {
587         uvm_gpu_t *peer_gpu;
588 
589         gpu = uvm_gpu_get(gpu_id);
590 
591         uvm_processor_mask_clear(retained_gpus, gpu_id);
592 
593         for_each_gpu_in_mask(peer_gpu, retained_gpus) {
594             NvU32 peer_table_index = uvm_gpu_peer_table_index(gpu->id, peer_gpu->id);
595             if (test_bit(peer_table_index, va_space->enabled_peers_teardown)) {
596                 uvm_gpu_peer_t *peer_caps = &g_uvm_global.peers[peer_table_index];
597 
598                 if (peer_caps->link_type == UVM_GPU_LINK_PCIE)
599                     uvm_gpu_release_pcie_peer_access(gpu, peer_gpu);
600 
601                 __clear_bit(peer_table_index, va_space->enabled_peers_teardown);
602             }
603         }
604 
605         uvm_gpu_release_locked(gpu);
606     }
607 
608     UVM_ASSERT(bitmap_empty(va_space->enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS));
609     UVM_ASSERT(bitmap_empty(va_space->enabled_peers_teardown, UVM_MAX_UNIQUE_GPU_PAIRS));
610 
611     uvm_mutex_unlock(&g_uvm_global.global_lock);
612 
613     uvm_kvfree(va_space->mapping);
614     uvm_kvfree(va_space);
615 }
616 
uvm_va_space_stop_all_user_channels(uvm_va_space_t * va_space)617 void uvm_va_space_stop_all_user_channels(uvm_va_space_t *va_space)
618 {
619     uvm_gpu_va_space_t *gpu_va_space;
620     uvm_user_channel_t *user_channel;
621 
622     // Skip if all channels have been already stopped.
623     if (atomic_read(&va_space->user_channels_stopped))
624         return;
625 
626     uvm_assert_rwsem_locked_read(&va_space->lock);
627 
628     for_each_gpu_va_space(gpu_va_space, va_space) {
629         list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node)
630             uvm_user_channel_stop(user_channel);
631     }
632 
633     // Since we're holding the VA space lock in read mode, multiple threads
634     // could set this concurrently. user_channels_stopped never transitions back
635     // to 0 after being set to 1 so that's not a problem.
636     atomic_set(&va_space->user_channels_stopped, 1);
637 }
638 
uvm_va_space_get_gpu_by_uuid(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)639 uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
640 {
641     uvm_gpu_t *gpu;
642 
643     for_each_va_space_gpu(gpu, va_space) {
644         if (uvm_uuid_eq(&gpu->uuid, gpu_uuid))
645             return gpu;
646     }
647 
648     return NULL;
649 }
650 
uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)651 uvm_gpu_t *uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t *va_space,
652                                                           const NvProcessorUuid *gpu_uuid)
653 {
654     uvm_gpu_t *gpu;
655 
656     gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
657     if (!gpu || !uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
658         return NULL;
659 
660     return gpu;
661 }
662 
uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)663 uvm_gpu_t *uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
664 {
665     uvm_gpu_t *gpu;
666 
667     uvm_va_space_down_read(va_space);
668 
669     gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
670     if (gpu)
671         uvm_gpu_retain(gpu);
672 
673     uvm_va_space_up_read(va_space);
674 
675     return gpu;
676 }
677 
uvm_va_space_can_read_duplicate(uvm_va_space_t * va_space,uvm_gpu_t * changing_gpu)678 bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changing_gpu)
679 {
680     NvU32 count = va_space->num_non_faultable_gpu_va_spaces;
681 
682     if (changing_gpu && !uvm_processor_mask_test(&va_space->faultable_processors, changing_gpu->id)) {
683         if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, changing_gpu->id)) {
684             // A non-faultable GPU is getting removed.
685             UVM_ASSERT(count > 0);
686             --count;
687         }
688         else {
689             // A non-faultable GPU is getting added.
690             ++count;
691         }
692     }
693 
694     return count == 0;
695 }
696 
697 // Note that the "VA space" in the function name refers to a UVM per-process
698 // VA space. (This is different from a per-GPU VA space.)
uvm_va_space_register_gpu(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid,const uvm_rm_user_object_t * user_rm_device,NvBool * numa_enabled,NvS32 * numa_node_id,NvProcessorUuid * uuid_out)699 NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
700                                     const NvProcessorUuid *gpu_uuid,
701                                     const uvm_rm_user_object_t *user_rm_device,
702                                     NvBool *numa_enabled,
703                                     NvS32 *numa_node_id,
704                                     NvProcessorUuid *uuid_out)
705 {
706     NV_STATUS status;
707     uvm_va_range_t *va_range;
708     uvm_gpu_t *gpu;
709     uvm_gpu_t *other_gpu;
710     bool gpu_can_access_sysmem = true;
711     uvm_processor_mask_t *peers_to_release = NULL;
712 
713     status = uvm_gpu_retain_by_uuid(gpu_uuid, user_rm_device, &gpu);
714     if (status != NV_OK)
715         return status;
716 
717     uvm_uuid_copy(uuid_out, &gpu->uuid);
718 
719     // Enabling access counters requires taking the ISR lock, so it is done
720     // without holding the (deeper order) VA space lock. Enabling the counters
721     // after dropping the VA space lock would create a window of time in which
722     // another thread could see the GPU as registered, but access counters would
723     // be disabled. Therefore, the counters are enabled before taking the VA
724     // space lock.
725     if (uvm_parent_gpu_access_counters_required(gpu->parent)) {
726         status = uvm_gpu_access_counters_enable(gpu, va_space);
727         if (status != NV_OK) {
728             uvm_gpu_release(gpu);
729             return status;
730         }
731     }
732 
733     uvm_va_space_down_write(va_space);
734 
735     // Make sure the gpu hasn't been already registered in this va space
736     if (uvm_processor_mask_test(&va_space->registered_gpus, gpu->id)) {
737         status = NV_ERR_INVALID_DEVICE;
738         goto done;
739     }
740 
741     // Mixing coherent and non-coherent GPUs is not supported
742     for_each_va_space_gpu(other_gpu, va_space) {
743         if (uvm_parent_gpu_is_coherent(gpu->parent) != uvm_parent_gpu_is_coherent(other_gpu->parent)) {
744             status = NV_ERR_INVALID_DEVICE;
745             goto done;
746         }
747     }
748 
749     // The VA space's mm is being torn down, so don't allow more work
750     if (va_space->disallow_new_registers) {
751         status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
752         goto done;
753     }
754 
755     if (g_uvm_global.conf_computing_enabled) {
756         NvU32 gpu_index = uvm_id_gpu_index(gpu->id);
757         status = uvm_conf_computing_dma_buffer_alloc(&gpu->conf_computing.dma_buffer_pool,
758                                                      &va_space->gpu_unregister_dma_buffer[gpu_index],
759                                                      NULL);
760         if (status != NV_OK)
761             goto done;
762 
763         gpu_can_access_sysmem = false;
764     }
765 
766     UVM_ASSERT(!va_space->peers_to_release[uvm_id_value(gpu->id)]);
767 
768     peers_to_release = uvm_processor_mask_cache_alloc();
769     if (!peers_to_release) {
770         status = NV_ERR_NO_MEMORY;
771         goto done;
772     }
773 
774     va_space->peers_to_release[uvm_id_value(gpu->id)] = peers_to_release;
775 
776     uvm_processor_mask_set(&va_space->registered_gpus, gpu->id);
777     va_space->registered_gpus_table[uvm_id_gpu_index(gpu->id)] = gpu;
778 
779     if (gpu->parent->isr.replayable_faults.handling) {
780         UVM_ASSERT(!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id));
781         uvm_processor_mask_set(&va_space->faultable_processors, gpu->id);
782 
783         UVM_ASSERT(!uvm_processor_mask_test(&va_space->system_wide_atomics_enabled_processors, gpu->id));
784         // System-wide atomics are enabled by default
785         uvm_processor_mask_set(&va_space->system_wide_atomics_enabled_processors, gpu->id);
786     }
787     else {
788         UVM_ASSERT(!uvm_processor_mask_test(&va_space->non_faultable_processors, gpu->id));
789         uvm_processor_mask_set(&va_space->non_faultable_processors, gpu->id);
790     }
791 
792     // All GPUs have native atomics on their own memory
793     processor_mask_array_set(va_space->has_native_atomics, gpu->id, gpu->id);
794 
795     // TODO: Bug 3252572: Support the new link type UVM_GPU_LINK_C2C
796     if (gpu->parent->system_bus.link >= UVM_GPU_LINK_NVLINK_1) {
797         processor_mask_array_set(va_space->has_nvlink, gpu->id, UVM_ID_CPU);
798         processor_mask_array_set(va_space->has_nvlink, UVM_ID_CPU, gpu->id);
799     }
800 
801     if (uvm_parent_gpu_is_coherent(gpu->parent)) {
802         processor_mask_array_set(va_space->has_native_atomics, gpu->id, UVM_ID_CPU);
803 
804         if (gpu->mem_info.numa.enabled) {
805             processor_mask_array_set(va_space->can_access, UVM_ID_CPU, gpu->id);
806             processor_mask_array_set(va_space->accessible_from, gpu->id, UVM_ID_CPU);
807             processor_mask_array_set(va_space->has_native_atomics, UVM_ID_CPU, gpu->id);
808         }
809     }
810 
811     // All processors have direct access to their own memory
812     processor_mask_array_set(va_space->can_access, gpu->id, gpu->id);
813     processor_mask_array_set(va_space->accessible_from, gpu->id, gpu->id);
814 
815     if (gpu_can_access_sysmem) {
816         processor_mask_array_set(va_space->can_access, gpu->id, UVM_ID_CPU);
817         processor_mask_array_set(va_space->accessible_from, UVM_ID_CPU, gpu->id);
818     }
819 
820     processor_mask_array_set(va_space->can_copy_from, gpu->id, gpu->id);
821     processor_mask_array_set(va_space->can_copy_from, gpu->id, UVM_ID_CPU);
822     processor_mask_array_set(va_space->can_copy_from, UVM_ID_CPU, gpu->id);
823 
824     // Update the CPU/GPU affinity masks
825     if (gpu->parent->closest_cpu_numa_node != -1) {
826         uvm_gpu_id_t gpu_id;
827 
828         for_each_gpu_id(gpu_id) {
829             uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
830 
831             // If this is the first time this node is seen, take a new entry of
832             // the array. Entries are never released in order to avoid having
833             // to deal with holes.
834             if (affinity->numa_node == -1) {
835                 UVM_ASSERT(uvm_processor_mask_empty(&affinity->gpus));
836                 affinity->numa_node = gpu->parent->closest_cpu_numa_node;
837             }
838 
839             if (affinity->numa_node == gpu->parent->closest_cpu_numa_node) {
840                 uvm_processor_mask_set(&affinity->gpus, gpu->id);
841                 break;
842             }
843         }
844     }
845 
846     status = register_gpu_peers(va_space, gpu);
847     if (status != NV_OK)
848         goto cleanup;
849 
850     uvm_perf_heuristics_register_gpu(va_space, gpu);
851 
852     uvm_for_each_va_range(va_range, va_space) {
853         status = uvm_va_range_register_gpu(va_range, gpu);
854         if (status != NV_OK)
855             goto cleanup;
856     }
857 
858     if (gpu->mem_info.numa.enabled) {
859         *numa_enabled = NV_TRUE;
860         *numa_node_id = (NvS32)uvm_gpu_numa_node(gpu);
861     }
862     else {
863         *numa_enabled = NV_FALSE;
864         *numa_node_id = -1;
865     }
866 
867     goto done;
868 
869 cleanup:
870     // Clear out all of the processor mask bits. No VA ranges have mapped or
871     // allocated anything on this GPU yet if we fail here, so we don't need
872     // a deferred_free_list, mm, etc.
873     unregister_gpu(va_space, gpu, NULL, NULL, NULL);
874 
875     va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
876 
877     uvm_processor_mask_cache_free(peers_to_release);
878 
879 done:
880     UVM_ASSERT(va_space_check_processors_masks(va_space));
881 
882     uvm_va_space_up_write(va_space);
883 
884     if (status != NV_OK) {
885         // There is no risk of disabling access counters on a previously
886         // registered GPU: the enablement step would have failed before even
887         // discovering that the GPU is already registered.
888         if (uvm_parent_gpu_access_counters_required(gpu->parent))
889             uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
890 
891         uvm_gpu_release(gpu);
892     }
893 
894     return status;
895 }
896 
uvm_va_space_unregister_gpu(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid)897 NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
898 {
899     uvm_gpu_t *gpu;
900     uvm_gpu_va_space_t *gpu_va_space;
901     struct mm_struct *mm;
902     uvm_gpu_id_t peer_gpu_id;
903     uvm_processor_mask_t *peers_to_release;
904     LIST_HEAD(deferred_free_list);
905 
906     // Stopping channels requires holding the VA space lock in read mode, so do
907     // it first. We start in write mode then drop to read in order to flush out
908     // other threads which are in the read-mode portion of any of the register
909     // or unregister operations.
910     uvm_va_space_down_write(va_space);
911 
912     gpu = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid);
913     if (!gpu) {
914         uvm_va_space_up_write(va_space);
915         return NV_ERR_INVALID_DEVICE;
916     }
917 
918     // We have to drop the VA space lock below mid-unregister. We have to
919     // prevent any other threads from coming in during that window and allowing
920     // new channels to enter the GPU. That means we must disallow:
921     // - GPU VA space register
922     // - GPU unregister (which would allow new GPU registers)
923     if (uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id)) {
924         uvm_va_space_up_write(va_space);
925         return NV_ERR_INVALID_DEVICE;
926     }
927 
928     uvm_processor_mask_set(&va_space->gpu_unregister_in_progress, gpu->id);
929 
930     uvm_va_space_downgrade_write_rm(va_space);
931 
932     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
933     if (gpu_va_space)
934         gpu_va_space_stop_all_channels(gpu_va_space);
935 
936     // We need to drop the lock to re-take it in write mode. We don't have to
937     // retain the GPU because we've prevented other threads from unregistering
938     // it from the VA space until we're done.
939     uvm_va_space_up_read_rm(va_space);
940 
941     // If uvm_parent_gpu_access_counters_required(gpu->parent) is true, a
942     // concurrent registration could enable access counters after they are
943     // disabled here.
944     // The concurrent registration will fail later on if it acquires the VA
945     // space lock before the unregistration does (because the GPU is still
946     // registered) and undo the access counters enablement, or succeed if it
947     // acquires the VA space lock after the unregistration does. Both outcomes
948     // result on valid states.
949     if (gpu->parent->access_counters_supported)
950         uvm_parent_gpu_access_counters_disable(gpu->parent, va_space);
951 
952     // mmap_lock is needed to establish CPU mappings to any pages evicted from
953     // the GPU if accessed by CPU is set for them.
954     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
955 
956     uvm_va_space_down_write(va_space);
957 
958     // We blocked out other GPU unregisters, so this GPU must still be
959     // registered. However, the GPU VA space might have been unregistered on us.
960     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu->id));
961     if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
962         UVM_ASSERT(uvm_gpu_va_space_get(va_space, gpu) == gpu_va_space);
963 
964     peers_to_release = va_space->peers_to_release[uvm_id_value(gpu->id)];
965 
966     va_space->peers_to_release[uvm_id_value(gpu->id)] = NULL;
967 
968     // This will call disable_peers for all GPU's peers, including NVLink
969     unregister_gpu(va_space, gpu, mm, &deferred_free_list, peers_to_release);
970 
971     UVM_ASSERT(uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id));
972     uvm_processor_mask_clear(&va_space->gpu_unregister_in_progress, gpu->id);
973 
974     uvm_va_space_up_write(va_space);
975 
976     // Unlock the mm since the call to uvm_deferred_free_object_list() requires
977     // that we don't hold any locks. We don't release the mm yet because that
978     // could call uvm_va_space_mm_shutdown() which waits for the deferred free
979     // list to be empty which would cause a deadlock.
980     if (mm)
981         uvm_up_read_mmap_lock(mm);
982 
983     uvm_deferred_free_object_list(&deferred_free_list);
984 
985     // Release the VA space's GPU and peer counts
986     uvm_mutex_lock(&g_uvm_global.global_lock);
987 
988     // Do not use for_each_gpu_in_mask as it reads the peer GPU state,
989     // which might get destroyed when we release the peer entry.
990     UVM_ASSERT(peers_to_release);
991 
992     for_each_gpu_id_in_mask(peer_gpu_id, peers_to_release) {
993         uvm_gpu_t *peer_gpu = uvm_gpu_get(peer_gpu_id);
994         UVM_ASSERT(uvm_gpu_peer_caps(gpu, peer_gpu)->link_type == UVM_GPU_LINK_PCIE);
995         uvm_gpu_release_pcie_peer_access(gpu, peer_gpu);
996     }
997 
998     uvm_processor_mask_cache_free(peers_to_release);
999 
1000     uvm_gpu_release_locked(gpu);
1001 
1002     uvm_mutex_unlock(&g_uvm_global.global_lock);
1003 
1004     uvm_va_space_mm_or_current_release(va_space, mm);
1005 
1006     return NV_OK;
1007 }
1008 
1009 // This does *not* release the global GPU peer entry
disable_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1,struct list_head * deferred_free_list)1010 static void disable_peers(uvm_va_space_t *va_space,
1011                           uvm_gpu_t *gpu0,
1012                           uvm_gpu_t *gpu1,
1013                           struct list_head *deferred_free_list)
1014 {
1015     NvU32 table_index;
1016     uvm_va_range_t *va_range;
1017 
1018     uvm_assert_rwsem_locked_write(&va_space->lock);
1019 
1020     table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1021 
1022     if (!test_bit(table_index, va_space->enabled_peers))
1023         return;
1024 
1025     // Unmap all page tables in this VA space which have peer mappings between
1026     // these two GPUs.
1027     uvm_for_each_va_range(va_range, va_space)
1028         uvm_va_range_disable_peer(va_range, gpu0, gpu1, deferred_free_list);
1029 
1030     processor_mask_array_clear(va_space->can_access, gpu0->id, gpu1->id);
1031     processor_mask_array_clear(va_space->can_access, gpu1->id, gpu0->id);
1032     processor_mask_array_clear(va_space->accessible_from, gpu0->id, gpu1->id);
1033     processor_mask_array_clear(va_space->accessible_from, gpu1->id, gpu0->id);
1034     processor_mask_array_clear(va_space->can_copy_from, gpu0->id, gpu1->id);
1035     processor_mask_array_clear(va_space->can_copy_from, gpu1->id, gpu0->id);
1036     processor_mask_array_clear(va_space->has_nvlink, gpu0->id, gpu1->id);
1037     processor_mask_array_clear(va_space->has_nvlink, gpu1->id, gpu0->id);
1038     processor_mask_array_clear(va_space->indirect_peers, gpu0->id, gpu1->id);
1039     processor_mask_array_clear(va_space->indirect_peers, gpu1->id, gpu0->id);
1040     processor_mask_array_clear(va_space->has_native_atomics, gpu0->id, gpu1->id);
1041     processor_mask_array_clear(va_space->has_native_atomics, gpu1->id, gpu0->id);
1042 
1043     __clear_bit(table_index, va_space->enabled_peers);
1044 
1045     va_space_check_processors_masks(va_space);
1046 }
1047 
enable_peers(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1048 static NV_STATUS enable_peers(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1049 {
1050     NV_STATUS status = NV_OK;
1051     uvm_gpu_va_space_t *gpu_va_space0, *gpu_va_space1;
1052     NvU32 table_index = 0;
1053     uvm_gpu_peer_t *peer_caps;
1054     uvm_va_range_t *va_range;
1055     LIST_HEAD(deferred_free_list);
1056 
1057     uvm_assert_rwsem_locked_write(&va_space->lock);
1058 
1059     // We know the GPUs were retained already, so now verify that they've been
1060     // registered by this specific VA space.
1061     if (!uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id) ||
1062         !uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id)) {
1063         return NV_ERR_INVALID_DEVICE;
1064     }
1065 
1066     table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1067     peer_caps = &g_uvm_global.peers[table_index];
1068 
1069     UVM_ASSERT(!test_bit(table_index, va_space->enabled_peers));
1070 
1071     // If both GPUs have registered GPU VA spaces already, their big page sizes
1072     // must match.
1073     gpu_va_space0 = uvm_gpu_va_space_get(va_space, gpu0);
1074     gpu_va_space1 = uvm_gpu_va_space_get(va_space, gpu1);
1075     if (gpu_va_space0 &&
1076         gpu_va_space1 &&
1077         gpu_va_space0->page_tables.big_page_size != gpu_va_space1->page_tables.big_page_size) {
1078         return NV_ERR_NOT_COMPATIBLE;
1079     }
1080 
1081     processor_mask_array_set(va_space->can_access, gpu0->id, gpu1->id);
1082     processor_mask_array_set(va_space->can_access, gpu1->id, gpu0->id);
1083     processor_mask_array_set(va_space->accessible_from, gpu0->id, gpu1->id);
1084     processor_mask_array_set(va_space->accessible_from, gpu1->id, gpu0->id);
1085 
1086     if (gpu0->parent->peer_copy_mode != UVM_GPU_PEER_COPY_MODE_UNSUPPORTED) {
1087         UVM_ASSERT_MSG(gpu1->parent->peer_copy_mode == gpu0->parent->peer_copy_mode,
1088                        "GPU %s GPU %s\n",
1089                        uvm_gpu_name(gpu0),
1090                        uvm_gpu_name(gpu1));
1091 
1092         processor_mask_array_set(va_space->can_copy_from, gpu1->id, gpu0->id);
1093         processor_mask_array_set(va_space->can_copy_from, gpu0->id, gpu1->id);
1094     }
1095 
1096     // Pre-compute nvlink and native atomic masks for the new peers
1097     if (peer_caps->link_type >= UVM_GPU_LINK_NVLINK_1) {
1098         processor_mask_array_set(va_space->has_nvlink, gpu0->id, gpu1->id);
1099         processor_mask_array_set(va_space->has_nvlink, gpu1->id, gpu0->id);
1100 
1101         processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
1102         processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
1103 
1104         if (peer_caps->is_indirect_peer) {
1105             UVM_ASSERT(peer_caps->link_type >= UVM_GPU_LINK_NVLINK_2);
1106             UVM_ASSERT(gpu0->mem_info.numa.enabled);
1107             UVM_ASSERT(gpu1->mem_info.numa.enabled);
1108 
1109             processor_mask_array_set(va_space->indirect_peers, gpu0->id, gpu1->id);
1110             processor_mask_array_set(va_space->indirect_peers, gpu1->id, gpu0->id);
1111         }
1112     }
1113     else if (gpu0->parent == gpu1->parent) {
1114         processor_mask_array_set(va_space->has_native_atomics, gpu0->id, gpu1->id);
1115         processor_mask_array_set(va_space->has_native_atomics, gpu1->id, gpu0->id);
1116     }
1117 
1118     UVM_ASSERT(va_space_check_processors_masks(va_space));
1119     __set_bit(table_index, va_space->enabled_peers);
1120 
1121     uvm_for_each_va_range(va_range, va_space) {
1122         status = uvm_va_range_enable_peer(va_range, gpu0, gpu1);
1123         if (status != NV_OK)
1124             break;
1125     }
1126 
1127     if (status != NV_OK) {
1128         disable_peers(va_space, gpu0, gpu1, &deferred_free_list);
1129 
1130         // uvm_va_range_disable_peer adds only external allocations to the list,
1131         // but uvm_va_range_enable_peer doesn't do anything for them.
1132         UVM_ASSERT(list_empty(&deferred_free_list));
1133     }
1134 
1135     return status;
1136 }
1137 
1138 // On success the GPUs and the P2P access have been retained, but the caller
1139 // must not assume that the GPUs are still registered in the VA space after the
1140 // call since the VA space lock is dropped.
retain_pcie_peers_from_uuids(uvm_va_space_t * va_space,const NvProcessorUuid * gpu_uuid_1,const NvProcessorUuid * gpu_uuid_2,uvm_gpu_t ** gpu0,uvm_gpu_t ** gpu1)1141 static NV_STATUS retain_pcie_peers_from_uuids(uvm_va_space_t *va_space,
1142                                               const NvProcessorUuid *gpu_uuid_1,
1143                                               const NvProcessorUuid *gpu_uuid_2,
1144                                               uvm_gpu_t **gpu0,
1145                                               uvm_gpu_t **gpu1)
1146 {
1147     NV_STATUS status = NV_OK;
1148 
1149     uvm_va_space_down_read_rm(va_space);
1150 
1151     // The UUIDs should have already been registered
1152     *gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid_1);
1153     *gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, gpu_uuid_2);
1154 
1155     if (*gpu0 && *gpu1 && !uvm_parent_id_equal((*gpu0)->parent->id, (*gpu1)->parent->id))
1156         status = uvm_gpu_retain_pcie_peer_access(*gpu0, *gpu1);
1157     else
1158         status = NV_ERR_INVALID_DEVICE;
1159 
1160     uvm_va_space_up_read_rm(va_space);
1161 
1162     return status;
1163 }
1164 
uvm_va_space_pcie_peer_enabled(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1165 static bool uvm_va_space_pcie_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1166 {
1167     return !processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id) &&
1168            gpu0->parent != gpu1->parent &&
1169            uvm_va_space_peer_enabled(va_space, gpu0, gpu1);
1170 }
1171 
uvm_va_space_nvlink_peer_enabled(uvm_va_space_t * va_space,uvm_gpu_t * gpu0,uvm_gpu_t * gpu1)1172 static bool uvm_va_space_nvlink_peer_enabled(uvm_va_space_t *va_space, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1)
1173 {
1174     return processor_mask_array_test(va_space->has_nvlink, gpu0->id, gpu1->id);
1175 }
1176 
free_gpu_va_space(nv_kref_t * nv_kref)1177 static void free_gpu_va_space(nv_kref_t *nv_kref)
1178 {
1179     uvm_gpu_va_space_t *gpu_va_space = container_of(nv_kref, uvm_gpu_va_space_t, kref);
1180     uvm_gpu_va_space_state_t state = uvm_gpu_va_space_state(gpu_va_space);
1181     UVM_ASSERT(state == UVM_GPU_VA_SPACE_STATE_INIT || state == UVM_GPU_VA_SPACE_STATE_DEAD);
1182     uvm_kvfree(gpu_va_space);
1183 }
1184 
uvm_gpu_va_space_release(uvm_gpu_va_space_t * gpu_va_space)1185 void uvm_gpu_va_space_release(uvm_gpu_va_space_t *gpu_va_space)
1186 {
1187     if (gpu_va_space)
1188         nv_kref_put(&gpu_va_space->kref, free_gpu_va_space);
1189 }
1190 
uvm_gpu_va_space_acquire_mmap_lock(struct mm_struct * mm)1191 static void uvm_gpu_va_space_acquire_mmap_lock(struct mm_struct *mm)
1192 {
1193     if (mm) {
1194         // uvm_ats_register_gpu_va_space() requires mmap_lock to be held in
1195         // write mode if IBM ATS support is provided through the kernel.
1196         // mmap_lock is optional if IBM ATS support is provided through the
1197         // driver. In all cases, We need mmap_lock at least in read mode to
1198         // handle potential CPU mapping changes in
1199         // uvm_va_range_add_gpu_va_space().
1200         if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
1201             uvm_down_write_mmap_lock(mm);
1202         else
1203             uvm_down_read_mmap_lock(mm);
1204     }
1205 }
1206 
uvm_gpu_va_space_release_mmap_lock(struct mm_struct * mm)1207 static void uvm_gpu_va_space_release_mmap_lock(struct mm_struct *mm)
1208 {
1209     if (mm) {
1210         if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
1211             uvm_up_write_mmap_lock(mm);
1212         else
1213             uvm_up_read_mmap_lock(mm);
1214     }
1215 }
1216 
uvm_gpu_va_space_set_page_dir(uvm_gpu_va_space_t * gpu_va_space)1217 static NV_STATUS uvm_gpu_va_space_set_page_dir(uvm_gpu_va_space_t *gpu_va_space)
1218 {
1219     NV_STATUS status;
1220     uvm_gpu_phys_address_t pdb_phys;
1221     NvU64 num_pdes;
1222     NvU32 pasid = -1U;
1223 
1224     if (gpu_va_space->ats.enabled) {
1225         pasid = gpu_va_space->ats.pasid;
1226         UVM_ASSERT(pasid != -1U);
1227     }
1228 
1229     // Replace the existing PDB, if present, with the new one allocated by UVM.
1230     // This will fail if nvUvmInterfaceSetPageDirectory has already been called
1231     // on the RM VA space object, which prevents the user from registering twice
1232     // and corrupting our state.
1233     //
1234     // TODO: Bug 1733664: RM needs to preempt and disable channels during this
1235     //       operation.
1236     pdb_phys = uvm_page_tree_pdb(&gpu_va_space->page_tables)->addr;
1237     num_pdes = uvm_mmu_page_tree_entries(&gpu_va_space->page_tables, 0, UVM_PAGE_SIZE_AGNOSTIC);
1238     status = uvm_rm_locked_call(nvUvmInterfaceSetPageDirectory(gpu_va_space->duped_gpu_va_space,
1239                                                                pdb_phys.address,
1240                                                                num_pdes,
1241                                                                pdb_phys.aperture == UVM_APERTURE_VID,
1242                                                                pasid));
1243     if (status != NV_OK) {
1244         if (status == NV_ERR_NOT_SUPPORTED) {
1245             // Convert to the return code specified by uvm.h for
1246             // already-registered PDBs.
1247             status = NV_ERR_INVALID_DEVICE;
1248         }
1249         else {
1250             UVM_DBG_PRINT("nvUvmInterfaceSetPageDirectory() failed: %s, GPU %s\n",
1251                           nvstatusToString(status),
1252                           uvm_gpu_name(gpu_va_space->gpu));
1253         }
1254 
1255         return status;
1256     }
1257 
1258     gpu_va_space->did_set_page_directory = true;
1259     return status;
1260 }
1261 
1262 void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space)
1263 {
1264     if (uvm_gpu_va_space_state(gpu_va_space) != UVM_GPU_VA_SPACE_STATE_INIT)
1265         uvm_assert_rwsem_locked_read(&gpu_va_space->va_space->lock);
1266 
1267     if (gpu_va_space->did_set_page_directory) {
1268         NV_STATUS status = uvm_rm_locked_call(nvUvmInterfaceUnsetPageDirectory(gpu_va_space->duped_gpu_va_space));
1269         UVM_ASSERT_MSG(status == NV_OK,
1270                        "nvUvmInterfaceUnsetPageDirectory() failed: %s, GPU %s\n",
1271                        nvstatusToString(status),
1272                        uvm_gpu_name(gpu_va_space->gpu));
1273         gpu_va_space->did_set_page_directory = false;
1274     }
1275 }
1276 
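// Tear down a GPU VA space in the INIT or DEAD state. This is safe to call on
// a partially-constructed gpu_va_space, which is how the error path of
// create_gpu_va_space() cleans up.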
1277 static void destroy_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
1278 {
1279     NvU64 delay_us = 0;
1280     uvm_va_space_t *va_space;
1281     uvm_gpu_va_space_state_t state;
1282 
1283     if (!gpu_va_space)
1284         return;
1285 
1286     state = uvm_gpu_va_space_state(gpu_va_space);
1287     UVM_ASSERT(state == UVM_GPU_VA_SPACE_STATE_INIT || state == UVM_GPU_VA_SPACE_STATE_DEAD);
1288 
1289     va_space = gpu_va_space->va_space;
1290     UVM_ASSERT(va_space);
1291 
1292     delay_us = atomic64_read(&va_space->test.destroy_gpu_va_space_delay_us);
1293 
1294     if (delay_us)
1295         udelay(delay_us);
1296 
1297     // Serialize this uvm_gpu_va_space_unset_page_dir call with the one in
1298     // uvm_va_space_mm_shutdown, which also starts with the VA space lock in
1299     // write mode. RM will serialize the calls internally, so we lock here only
1300     // to avoid getting benign errors from nvUvmInterfaceUnsetPageDirectory.
1301     //
1302     // If we never got to add_gpu_va_space, then gpu_va_space was never
1303     // registered within the va_space, so uvm_va_space_mm_shutdown couldn't see
1304     // it and we don't have to take the lock. state is guaranteed to be
1305     // UVM_GPU_VA_SPACE_STATE_INIT if add_gpu_va_space wasn't reached.
1306     if (state != UVM_GPU_VA_SPACE_STATE_INIT) {
1307         uvm_va_space_down_write(va_space);
1308         uvm_va_space_downgrade_write_rm(va_space);
1309     }
1310 
1311     uvm_gpu_va_space_unset_page_dir(gpu_va_space);
1312 
1313     if (state != UVM_GPU_VA_SPACE_STATE_INIT)
1314         uvm_va_space_up_read_rm(va_space);
1315 
1316     if (gpu_va_space->page_tables.root)
1317         uvm_page_tree_deinit(&gpu_va_space->page_tables);
1318 
1319     if (gpu_va_space->duped_gpu_va_space)
1320         uvm_rm_locked_call_void(nvUvmInterfaceAddressSpaceDestroy(gpu_va_space->duped_gpu_va_space));
1321 
1322     // If the state is DEAD, then this GPU VA space is tracked in
1323     // va_space->gpu_va_space_deferred_free. uvm_ats_unregister_gpu_va_space may
1324     // wait for this count to go to 0 via uvm_va_space_mm_shutdown, so we must
1325     // decrement it before calling that function.
1326     if (gpu_va_space->state == UVM_GPU_VA_SPACE_STATE_DEAD) {
1327         int num_pending = atomic_dec_return(&va_space->gpu_va_space_deferred_free.num_pending);
1328         if (num_pending == 0)
1329             wake_up_all(&va_space->gpu_va_space_deferred_free.wait_queue);
1330         else
1331             UVM_ASSERT(num_pending > 0);
1332     }
1333 
1334     // Note that this call may wait for faults to finish being serviced, which
1335     // means it may depend on the VA space lock and mmap_lock.
1336     uvm_ats_unregister_gpu_va_space(gpu_va_space);
1337 
1338     uvm_ats_unbind_gpu(gpu_va_space);
1339 
1341     uvm_gpu_va_space_release(gpu_va_space);
1342 }
1343 
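// Allocate a gpu_va_space in the INIT state: dup the user's RM VA space
// object, validate its ATS and big page settings, and initialize the GPU page
// tree. The result is not yet visible in the VA space; add_gpu_va_space()
// activates it later under the VA space lock.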
1344 static NV_STATUS create_gpu_va_space(uvm_gpu_t *gpu,
1345                                      uvm_va_space_t *va_space,
1346                                      uvm_rm_user_object_t *user_rm_va_space,
1347                                      uvm_gpu_va_space_t **out_gpu_va_space)
1348 {
1349     NV_STATUS status;
1350     uvm_gpu_va_space_t *gpu_va_space;
1351     UvmGpuAddressSpaceInfo gpu_address_space_info;
1352 
1353     *out_gpu_va_space = NULL;
1354 
1355     gpu_va_space = uvm_kvmalloc_zero(sizeof(*gpu_va_space));
1356     if (!gpu_va_space)
1357         return NV_ERR_NO_MEMORY;
1358 
1359     gpu_va_space->gpu = gpu;
1360     gpu_va_space->va_space = va_space;
1361     INIT_LIST_HEAD(&gpu_va_space->registered_channels);
1362     INIT_LIST_HEAD(&gpu_va_space->channel_va_ranges);
1363     nv_kref_init(&gpu_va_space->kref);
1364 
1365     // TODO: Bug 1624521: This interface needs to use rm_control_fd to do
1366     //       validation.
1367     (void)user_rm_va_space->rm_control_fd;
1368     status = uvm_rm_locked_call(nvUvmInterfaceDupAddressSpace(uvm_gpu_device_handle(gpu),
1369                                                               user_rm_va_space->user_client,
1370                                                               user_rm_va_space->user_object,
1371                                                               &gpu_va_space->duped_gpu_va_space,
1372                                                               &gpu_address_space_info));
1373     if (status != NV_OK) {
1374         UVM_DBG_PRINT("failed to dup address space with error: %s, for GPU: %s\n",
1375                       nvstatusToString(status), uvm_gpu_name(gpu));
1376         goto error;
1377     }
1378 
1379     gpu_va_space->ats.enabled = gpu_address_space_info.atsEnabled;
1380 
1381     // If ATS support in the UVM driver isn't enabled, fail registration of GPU
1382     // VA spaces which have ATS enabled.
1383     if (!g_uvm_global.ats.enabled && gpu_va_space->ats.enabled) {
1384         UVM_INFO_PRINT("GPU VA space requires ATS, but ATS is not supported or enabled\n");
1385         status = NV_ERR_INVALID_FLAGS;
1386         goto error;
1387     }
1388 
1389     // If this GPU VA space uses ATS then pageable memory access must not have
1390     // been disabled in the VA space.
1391     if (gpu_va_space->ats.enabled && !uvm_va_space_pageable_mem_access_supported(va_space)) {
1392         UVM_INFO_PRINT("GPU VA space requires ATS, but pageable memory access is not supported\n");
1393         status = NV_ERR_INVALID_FLAGS;
1394         goto error;
1395     }
1396 
1397     // RM allows the creation of VA spaces on Pascal with 128k big pages. We
1398     // don't support that, so just fail those attempts.
1399     //
1400     // TODO: Bug 1789555: Remove this check once RM disallows this case.
1401     if (!gpu->parent->arch_hal->mmu_mode_hal(gpu_address_space_info.bigPageSize)) {
1402         status = NV_ERR_INVALID_FLAGS;
1403         goto error;
1404     }
1405 
1406     // Set up this GPU's page tables
1407     UVM_ASSERT(gpu_va_space->page_tables.root == NULL);
1408     status = uvm_page_tree_init(gpu,
1409                                 gpu_va_space,
1410                                 UVM_PAGE_TREE_TYPE_USER,
1411                                 gpu_address_space_info.bigPageSize,
1412                                 uvm_get_page_tree_location(gpu->parent),
1413                                 &gpu_va_space->page_tables);
1414     if (status != NV_OK) {
1415         UVM_ERR_PRINT("Initializing the page tree failed: %s, GPU %s\n", nvstatusToString(status), uvm_gpu_name(gpu));
1416         goto error;
1417     }
1418 
1419     status = uvm_ats_bind_gpu(gpu_va_space);
1420     if (status != NV_OK)
1421         goto error;
1422 
1423     *out_gpu_va_space = gpu_va_space;
1424     return NV_OK;
1425 
1426 error:
1427     destroy_gpu_va_space(gpu_va_space);
1428     return status;
1429 }
1430 
1431 static void add_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
1432 {
1433     uvm_va_space_t *va_space = gpu_va_space->va_space;
1434     uvm_gpu_t *gpu = gpu_va_space->gpu;
1435 
1436     UVM_ASSERT(va_space);
1437     uvm_assert_rwsem_locked_write(&va_space->lock);
1438 
1439     if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id))
1440         va_space->num_non_faultable_gpu_va_spaces++;
1441 
1442     uvm_processor_mask_set(&va_space->registered_gpu_va_spaces, gpu->id);
1443     va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)] = gpu_va_space;
1444     gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_ACTIVE;
1445 }
1446 
1447 static NV_STATUS check_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space)
1448 {
1449     uvm_va_space_t *va_space = gpu_va_space->va_space;
1450     uvm_gpu_t *gpu = gpu_va_space->gpu;
1451     uvm_gpu_t *other_gpu;
1452     uvm_gpu_va_space_t *other_gpu_va_space;
1453 
1454     UVM_ASSERT(va_space);
1455     uvm_assert_rwsem_locked_write(&va_space->lock);
1456 
1457     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_INIT);
1458 
1459     if (!uvm_processor_mask_test(&va_space->registered_gpus, gpu->id))
1460         return NV_ERR_INVALID_DEVICE;
1461 
1462     // RM will return an error from create_gpu_va_space if the given RM VA space
1463     // object has already been registered by any VA space. Now we just need to
1464     // check if a different VA space has already been registered.
1465     if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu->id))
1466         return NV_ERR_INVALID_DEVICE;
1467 
1468     // If a GPU unregister is in progress but temporarily dropped the VA space
1469     // lock, we can't register new GPU VA spaces.
1470     if (uvm_processor_mask_test(&va_space->gpu_unregister_in_progress, gpu->id))
1471         return NV_ERR_INVALID_DEVICE;
1472 
1473     // The VA space's mm is being torn down, so don't allow more work
1474     if (va_space->disallow_new_registers)
1475         return NV_ERR_PAGE_TABLE_NOT_AVAIL;
1476 
1477     // This GPU VA space must match its big page size with all enabled peers.
1478     // Also, the new GPU VA space must have the same ATS setting as previously-
1479     // registered GPU VA spaces.
1480     for_each_va_space_gpu_in_mask(other_gpu, va_space, &va_space->registered_gpu_va_spaces) {
1481         UVM_ASSERT(other_gpu != gpu);
1482 
1483         other_gpu_va_space = uvm_gpu_va_space_get(va_space, other_gpu);
1484         if (other_gpu_va_space->ats.enabled != gpu_va_space->ats.enabled)
1485             return NV_ERR_INVALID_FLAGS;
1486 
1487         if (!test_bit(uvm_gpu_peer_table_index(gpu->id, other_gpu->id), va_space->enabled_peers))
1488             continue;
1489 
1490         if (gpu_va_space->page_tables.big_page_size != other_gpu_va_space->page_tables.big_page_size)
1491             return NV_ERR_NOT_COMPATIBLE;
1492     }
1493 
1494     return NV_OK;
1495 }
1496 
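// Registering a GPU VA space is done in two passes under mmap_lock and the VA
// space lock: the new gpu_va_space is checked and registered with ATS, then
// both locks are dropped while uvm_gpu_va_space_set_page_dir() calls into RM,
// and finally the locks are retaken to re-run check_gpu_va_space() (the VA
// space state may have changed in between) and activate the GPU VA space.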
1497 NV_STATUS uvm_va_space_register_gpu_va_space(uvm_va_space_t *va_space,
1498                                              uvm_rm_user_object_t *user_rm_va_space,
1499                                              const NvProcessorUuid *gpu_uuid)
1500 {
1501     NV_STATUS status;
1502     uvm_gpu_t *gpu;
1503     uvm_gpu_va_space_t *gpu_va_space;
1504     uvm_va_range_t *va_range;
1505     struct mm_struct *mm;
1506     LIST_HEAD(deferred_free_list);
1507 
1508     gpu = uvm_va_space_retain_gpu_by_uuid(va_space, gpu_uuid);
1509     if (!gpu)
1510         return NV_ERR_INVALID_DEVICE;
1511 
1512     mm = uvm_va_space_mm_or_current_retain(va_space);
1513     if (!mm) {
1514         status = NV_ERR_PAGE_TABLE_NOT_AVAIL;
1515         goto error_gpu_release;
1516     }
1517 
1518     status = create_gpu_va_space(gpu, va_space, user_rm_va_space, &gpu_va_space);
1519     if (status != NV_OK)
1520         goto error_gpu_release;
1521 
1522     uvm_gpu_va_space_acquire_mmap_lock(mm);
1523     uvm_va_space_down_write(va_space);
1524 
1525     status = check_gpu_va_space(gpu_va_space);
1526     if (status != NV_OK)
1527         goto error_unlock;
1528 
1529     status = uvm_ats_register_gpu_va_space(gpu_va_space);
1530     if (status != NV_OK)
1531         goto error_unlock;
1532 
1533     uvm_va_space_up_write(va_space);
1534     uvm_gpu_va_space_release_mmap_lock(mm);
1535 
1536     status = uvm_gpu_va_space_set_page_dir(gpu_va_space);
1537     if (status != NV_OK)
1538         goto error_destroy;
1539 
1540     uvm_gpu_va_space_acquire_mmap_lock(mm);
1541     uvm_va_space_down_write(va_space);
1542 
1543     // va_space state might have changed before the lock reacquire for write.
1544     // So, check the state again.
1545     status = check_gpu_va_space(gpu_va_space);
1546     if (status != NV_OK)
1547         goto error_unlock;
1548 
1549     add_gpu_va_space(gpu_va_space);
1550 
1551     // Tell the VA ranges that they can map this GPU, if they need to.
1552     //
1553     // Ideally we'd downgrade the VA space lock to read mode while adding new
1554     // mappings, but that would complicate error handling since we have to
1555     // remove the GPU VA space if any of these mappings fail.
1556     uvm_for_each_va_range(va_range, va_space) {
1557         status = uvm_va_range_add_gpu_va_space(va_range, gpu_va_space, mm);
1558         if (status != NV_OK)
1559             goto error;
1560     }
1561 
1562     uvm_va_space_up_write(va_space);
1563     uvm_gpu_va_space_release_mmap_lock(mm);
1564 
1565     uvm_va_space_mm_or_current_release(va_space, mm);
1566     uvm_gpu_release(gpu);
1567 
1568     return NV_OK;
1569 
1570 error:
1571     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1572     remove_gpu_va_space(gpu_va_space, mm, &deferred_free_list);
1573 
1574     // Nothing else could've been attached to this gpu_va_space (channels,
1575     // external allocations) because we've held the VA space lock since
1576     // add_gpu_va_space(). Therefore the GPU VA space itself should be
1577     // the only item in the list, and we can just destroy it directly below.
1578     UVM_ASSERT(list_is_singular(&deferred_free_list));
1579 error_unlock:
1580     uvm_va_space_up_write(va_space);
1581     uvm_gpu_va_space_release_mmap_lock(mm);
1582 error_destroy:
1583     destroy_gpu_va_space(gpu_va_space);
1584 error_gpu_release:
1585     uvm_va_space_mm_or_current_release(va_space, mm);
1586     uvm_gpu_release(gpu);
1587     return status;
1588 }
1589 
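// Return the gpu index of the single GPU VA space registered under parent_gpu
// in this VA space, or UVM_ID_MAX_PROCESSORS if none is registered.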
1590 static NvU32 find_gpu_va_space_index(uvm_va_space_t *va_space,
1591                                      uvm_parent_gpu_t *parent_gpu)
1592 {
1593     uvm_gpu_id_t gpu_id;
1594     NvU32 index = UVM_ID_MAX_PROCESSORS;
1595 
1596     // TODO: Bug 4351121: this conversion from parent ID to gpu ID depends on
1597     // the fact that only one partition is registered per va_space per physical
1598     // GPU. This code will need to change when multiple MIG instances are
1599     // supported.
1600     for_each_sub_processor_id_in_parent_gpu(gpu_id, parent_gpu->id) {
1601         if (uvm_processor_mask_test(&va_space->registered_gpu_va_spaces, gpu_id)) {
1602             UVM_ASSERT(index == UVM_ID_MAX_PROCESSORS);
1603             index = uvm_id_gpu_index(gpu_id);
1604         }
1605     }
1606 
1607     return index;
1608 }
1609 
1610 uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
1611                                                        uvm_parent_gpu_t *parent_gpu)
1612 {
1613     uvm_gpu_va_space_t *gpu_va_space;
1614     NvU32 gpu_index;
1615 
1616     uvm_assert_rwsem_locked(&va_space->lock);
1617 
1618     if (!parent_gpu)
1619         return NULL;
1620 
1621     gpu_index = find_gpu_va_space_index(va_space, parent_gpu);
1622     if (gpu_index == UVM_ID_MAX_PROCESSORS)
1623         return NULL;
1624 
1625     gpu_va_space = va_space->gpu_va_spaces[gpu_index];
1626     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1627     UVM_ASSERT(gpu_va_space->va_space == va_space);
1628     UVM_ASSERT(gpu_va_space->gpu->parent == parent_gpu);
1629 
1630     return gpu_va_space;
1631 }
1632 
1633 // The caller must have stopped all channels under this gpu_va_space before
1634 // calling this function.
1635 static void remove_gpu_va_space(uvm_gpu_va_space_t *gpu_va_space,
1636                                 struct mm_struct *mm,
1637                                 struct list_head *deferred_free_list)
1638 {
1639     uvm_va_space_t *va_space;
1640     uvm_va_range_t *va_range;
1641     uvm_va_range_t *va_range_next;
1642     uvm_gpu_t *gpu;
1643 
1644     if (!gpu_va_space || uvm_gpu_va_space_state(gpu_va_space) != UVM_GPU_VA_SPACE_STATE_ACTIVE)
1645         return;
1646 
1647     va_space = gpu_va_space->va_space;
1648     UVM_ASSERT(va_space);
1649 
1650     uvm_assert_rwsem_locked_write(&va_space->lock);
1651 
1652     uvm_gpu_va_space_detach_all_user_channels(gpu_va_space, deferred_free_list);
1653 
1654     // Removing all registered channels should've removed all VA ranges used by
1655     // those channels.
1656     UVM_ASSERT(list_empty(&gpu_va_space->channel_va_ranges));
1657 
1658     // Unmap all page tables in this VA space on this GPU.
1659     // TODO: Bug 1799173: This will need to add objects to deferred_free_list
1660     uvm_for_each_va_range_safe(va_range, va_range_next, va_space)
1661         uvm_va_range_remove_gpu_va_space(va_range, gpu_va_space, mm, deferred_free_list);
1662 
1663     uvm_hmm_remove_gpu_va_space(va_space, gpu_va_space, mm);
1664 
1665     uvm_deferred_free_object_add(deferred_free_list,
1666                                  &gpu_va_space->deferred_free,
1667                                  UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE);
1668 
1669     // Let uvm_va_space_mm_shutdown know that it has to wait for this GPU VA
1670     // space to be destroyed.
1671     atomic_inc(&va_space->gpu_va_space_deferred_free.num_pending);
1672 
1673     gpu = gpu_va_space->gpu;
1674 
1675     if (!uvm_processor_mask_test(&va_space->faultable_processors, gpu->id)) {
1676         UVM_ASSERT(va_space->num_non_faultable_gpu_va_spaces);
1677         va_space->num_non_faultable_gpu_va_spaces--;
1678     }
1679 
1680     uvm_processor_mask_clear(&va_space->registered_gpu_va_spaces, gpu->id);
1681     va_space->gpu_va_spaces[uvm_id_gpu_index(gpu->id)] = NULL;
1682     gpu_va_space->state = UVM_GPU_VA_SPACE_STATE_DEAD;
1683 }
1684 
1685 NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid)
1686 {
1687     NV_STATUS status = NV_OK;
1688     uvm_gpu_t *gpu;
1689     uvm_gpu_va_space_t *gpu_va_space;
1690     struct mm_struct *mm;
1691     LIST_HEAD(deferred_free_list);
1692 
1693     // Stopping channels requires holding the VA space lock in read mode, so do
1694     // it first. This also takes the serialize_writers_lock, so we'll serialize
1695     // with other threads about to perform channel binds in
1696     // uvm_register_channel.
1697     uvm_va_space_down_read_rm(va_space);
1698 
1699     gpu = uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, gpu_uuid);
1700     if (!gpu) {
1701         uvm_va_space_up_read_rm(va_space);
1702         return NV_ERR_INVALID_DEVICE;
1703     }
1704 
1705     gpu_va_space = uvm_gpu_va_space_get(va_space, gpu);
1706     UVM_ASSERT(gpu_va_space);
1707 
1708     gpu_va_space_stop_all_channels(gpu_va_space);
1709 
1710     // We need to drop the lock to re-take it in write mode
1711     uvm_gpu_va_space_retain(gpu_va_space);
1712     uvm_gpu_retain(gpu);
1713     uvm_va_space_up_read_rm(va_space);
1714 
1715     mm = uvm_va_space_mm_or_current_retain_lock(va_space);
1716     uvm_va_space_down_write(va_space);
1717 
1718     // We dropped the lock so we have to re-verify that this gpu_va_space is
1719     // still valid. If so, then the GPU is also still registered under the VA
1720     // space. If not, we raced with another unregister thread, so return an
1721     // error for double-unregister.
1722     if (uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_DEAD) {
1723         status = NV_ERR_INVALID_DEVICE;
1724     }
1725     else {
1726         UVM_ASSERT(gpu == uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(va_space, gpu_uuid));
1727         UVM_ASSERT(gpu_va_space == uvm_gpu_va_space_get(va_space, gpu));
1728 
1729         remove_gpu_va_space(gpu_va_space, mm, &deferred_free_list);
1730     }
1731 
1732     uvm_va_space_up_write(va_space);
1733 
1734     // Unlock the mm since the call to uvm_deferred_free_object_list() requires
1735     // that we don't hold any locks. We don't release the mm yet because that
1736     // could call uvm_va_space_mm_shutdown() which waits for the deferred free
1737     // list to be empty which would cause a deadlock.
1738     if (mm)
1739         uvm_up_read_mmap_lock(mm);
1740 
1741     uvm_deferred_free_object_list(&deferred_free_list);
1742     uvm_gpu_va_space_release(gpu_va_space);
1743     uvm_gpu_release(gpu);
1744 
1745     uvm_va_space_mm_or_current_release(va_space, mm);
1746 
1747     return status;
1748 }
1749 
1750 bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1)
1751 {
1752     size_t table_index;
1753 
1754     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu0->id));
1755     UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu1->id));
1756 
1757     table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1758     return !!test_bit(table_index, va_space->enabled_peers);
1759 }
1760 
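// Illustrative sketch (editorial example, not from the original source):
// assuming gpu0 and gpu1 are NVLink peers while gpu0 and gpu2 are PCIe peers,
// all registered in va_space, a call such as
//
//     uvm_processor_mask_t candidates;
//
//     uvm_processor_mask_zero(&candidates);
//     uvm_processor_mask_set(&candidates, UVM_ID_CPU);
//     uvm_processor_mask_set(&candidates, gpu1->id);
//     uvm_processor_mask_set(&candidates, gpu2->id);
//     uvm_processor_mask_find_closest_id(va_space, &candidates, gpu0->id);
//
// returns gpu1->id (NVLink peers win over everything except src itself). With
// gpu1 removed from candidates, gpu2->id is returned instead, since GPU peers
// with direct access are preferred over the CPU when src is a GPU.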
1761 uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
1762                                                       const uvm_processor_mask_t *candidates,
1763                                                       uvm_processor_id_t src)
1764 {
1765     uvm_processor_mask_t *mask = &va_space->closest_processors.mask;
1766     uvm_processor_id_t closest_id;
1767 
1768     // Highest priority: the local processor itself
1769     if (uvm_processor_mask_test(candidates, src))
1770         return src;
1771 
1772     uvm_mutex_lock(&va_space->closest_processors.mask_mutex);
1773 
1774     if (uvm_processor_mask_and(mask, candidates, &va_space->has_nvlink[uvm_id_value(src)])) {
1775         // NvLink peers
1776         uvm_processor_mask_t *indirect_peers;
1777         uvm_processor_mask_t *direct_peers = &va_space->closest_processors.direct_peers;
1778 
1779         indirect_peers = &va_space->indirect_peers[uvm_id_value(src)];
1780 
1781         if (uvm_processor_mask_andnot(direct_peers, mask, indirect_peers)) {
1782             // Direct peers, prioritizing GPU peers over CPU
1783             closest_id = uvm_processor_mask_find_first_gpu_id(direct_peers);
1784             if (UVM_ID_IS_INVALID(closest_id))
1785                 closest_id = UVM_ID_CPU;
1786         }
1787         else {
1788             // Indirect peers
1789             UVM_ASSERT(UVM_ID_IS_GPU(src));
1790             UVM_ASSERT(!uvm_processor_mask_test(mask, UVM_ID_CPU));
1791 
1792             closest_id = uvm_processor_mask_find_first_gpu_id(mask);
1793         }
1794     }
1795     else if (uvm_processor_mask_and(mask, candidates, &va_space->can_access[uvm_id_value(src)])) {
1796         // If source is GPU, prioritize PCIe peers over CPU.
1797         // CPUs only have direct access to GPU memory over NVLINK, not PCIe, and
1798         // should have been selected above
1799         UVM_ASSERT(UVM_ID_IS_GPU(src));
1800 
1801         closest_id = uvm_processor_mask_find_first_gpu_id(mask);
1802         if (UVM_ID_IS_INVALID(closest_id))
1803             closest_id = UVM_ID_CPU;
1804     }
1805     else {
1806         // No GPUs with direct access are in the mask. Just pick the first
1807         // processor in the mask, if any.
1808         closest_id = uvm_processor_mask_find_first_id(candidates);
1809     }
1810 
1811     uvm_mutex_unlock(&va_space->closest_processors.mask_mutex);
1812 
1813     return closest_id;
1814 }
1815 
1816 static void uvm_deferred_free_object_channel(uvm_deferred_free_object_t *object,
1817                                              uvm_parent_processor_mask_t *flushed_parent_gpus)
1818 {
1819     uvm_user_channel_t *channel = container_of(object, uvm_user_channel_t, deferred_free);
1820     uvm_gpu_t *gpu = channel->gpu;
1821 
1822     // Flush out any faults with this instance pointer still in the buffer. This
1823     // prevents us from re-allocating the same instance pointer for a new
1824     // channel and mis-attributing old faults to it.
1825     if (gpu->parent->replayable_faults_supported &&
1826         !uvm_parent_processor_mask_test(flushed_parent_gpus, gpu->parent->id)) {
1827         uvm_gpu_fault_buffer_flush(gpu);
1828         uvm_parent_processor_mask_set(flushed_parent_gpus, gpu->parent->id);
1829     }
1830 
1831     uvm_user_channel_destroy_detached(channel);
1832 }
1833 
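// Typical usage, as done by the callers in this file: objects are queued on a
// local list while the VA space lock is held in write mode, then the lock is
// dropped and the list is processed with no locks held, since freeing an
// object may flush fault buffers or re-take the VA space lock.
//
//     LIST_HEAD(deferred_free_list);
//
//     uvm_va_space_down_write(va_space);
//     // ... detach channels, remove GPU VA spaces, etc., queueing each
//     //     object with uvm_deferred_free_object_add() ...
//     uvm_va_space_up_write(va_space);
//
//     uvm_deferred_free_object_list(&deferred_free_list);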
1834 void uvm_deferred_free_object_list(struct list_head *deferred_free_list)
1835 {
1836     uvm_deferred_free_object_t *object, *next;
1837     uvm_parent_processor_mask_t flushed_parent_gpus;
1838 
1839     // flushed_parent_gpus prevents redundant fault buffer flushes by tracking
1840     // the parent GPUs on which the flush already happened. Flushing the fault
1841     // buffer on one GPU instance will flush it for all other instances on that
1842     // parent GPU.
1843     uvm_parent_processor_mask_zero(&flushed_parent_gpus);
1844 
1845     list_for_each_entry_safe(object, next, deferred_free_list, list_node) {
1846         list_del(&object->list_node);
1847 
1848         switch (object->type) {
1849             case UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL:
1850                 uvm_deferred_free_object_channel(object, &flushed_parent_gpus);
1851                 break;
1852             case UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE:
1853                 destroy_gpu_va_space(container_of(object, uvm_gpu_va_space_t, deferred_free));
1854                 break;
1855             case UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION:
1856                 uvm_ext_gpu_map_free(container_of(object, uvm_ext_gpu_map_t, deferred_free));
1857                 break;
1858             default:
1859                 UVM_ASSERT_MSG(0, "Invalid type %d\n", object->type);
1860         }
1861     }
1862 }
1863 
1864 uvm_user_channel_t *uvm_gpu_va_space_get_user_channel(uvm_gpu_va_space_t *gpu_va_space,
1865                                                       uvm_gpu_phys_address_t instance_ptr)
1866 {
1867     uvm_user_channel_t *user_channel;
1868     uvm_va_space_t *va_space = gpu_va_space->va_space;
1869 
1870     UVM_ASSERT(uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_ACTIVE);
1871     uvm_assert_rwsem_locked(&va_space->lock);
1872 
1873     // TODO: Bug 1880191: This is called on every non-replayable fault service.
1874     // Evaluate the performance impact of this list traversal and potentially
1875     // replace it with something better.
1876     list_for_each_entry(user_channel, &gpu_va_space->registered_channels, list_node) {
1877         if (user_channel->instance_ptr.addr.address == instance_ptr.address &&
1878             user_channel->instance_ptr.addr.aperture == instance_ptr.aperture) {
1879             return user_channel;
1880         }
1881     }
1882 
1883     return NULL;
1884 }
1885 
1886 NV_STATUS uvm_api_enable_peer_access(UVM_ENABLE_PEER_ACCESS_PARAMS *params, struct file *filp)
1887 {
1888     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1889     NV_STATUS status = NV_OK;
1890     uvm_gpu_t *gpu0 = NULL;
1891     uvm_gpu_t *gpu1 = NULL;
1892     size_t table_index;
1893 
1894     uvm_mutex_lock(&g_uvm_global.global_lock);
1895     status = retain_pcie_peers_from_uuids(va_space, &params->gpuUuidA, &params->gpuUuidB, &gpu0, &gpu1);
1896     uvm_mutex_unlock(&g_uvm_global.global_lock);
1897     if (status != NV_OK)
1898         return status;
1899 
1900     uvm_va_space_down_write(va_space);
1901 
1902     table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
1903     if (test_bit(table_index, va_space->enabled_peers))
1904         status = NV_ERR_INVALID_DEVICE;
1905     else
1906         status = enable_peers(va_space, gpu0, gpu1);
1907 
1908     uvm_va_space_up_write(va_space);
1909 
1910     if (status != NV_OK) {
1911         uvm_mutex_lock(&g_uvm_global.global_lock);
1912         uvm_gpu_release_pcie_peer_access(gpu0, gpu1);
1913         uvm_mutex_unlock(&g_uvm_global.global_lock);
1914     }
1915 
1916     return status;
1917 }
1918 
1919 NV_STATUS uvm_api_disable_peer_access(UVM_DISABLE_PEER_ACCESS_PARAMS *params, struct file *filp)
1920 {
1921     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1922     NV_STATUS status = NV_OK;
1923     uvm_gpu_t *gpu0, *gpu1;
1924     LIST_HEAD(deferred_free_list);
1925 
1926     uvm_va_space_down_write(va_space);
1927 
1928     gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
1929     gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);
1930 
1931     if (!gpu0 || !gpu1) {
1932         status = NV_ERR_INVALID_DEVICE;
1933         goto error;
1934     }
1935 
1936     if (uvm_id_equal(gpu0->id, gpu1->id)) {
1937         status = NV_ERR_INVALID_DEVICE;
1938         goto error;
1939     }
1940 
1941     if (!uvm_va_space_pcie_peer_enabled(va_space, gpu0, gpu1)) {
1942         status = NV_ERR_INVALID_DEVICE;
1943         goto error;
1944     }
1945 
1946     disable_peers(va_space, gpu0, gpu1, &deferred_free_list);
1947 
1948     // disable_peers doesn't release the GPU peer ref count, which means the two
1949     // GPUs will remain retained even if another thread unregisters them from
1950     // this VA space after we drop the lock.
1951     uvm_va_space_up_write(va_space);
1952 
1953     uvm_deferred_free_object_list(&deferred_free_list);
1954 
1955     uvm_mutex_lock(&g_uvm_global.global_lock);
1956     uvm_gpu_release_pcie_peer_access(gpu0, gpu1);
1957     uvm_mutex_unlock(&g_uvm_global.global_lock);
1958 
1959     return NV_OK;
1960 
1961 error:
1962     uvm_va_space_up_write(va_space);
1963     return status;
1964 }
1965 
1966 bool uvm_va_space_pageable_mem_access_supported(uvm_va_space_t *va_space)
1967 {
1968     // Any pageable memory access requires that we have mm_struct association
1969     // via va_space_mm.
1970     if (!uvm_va_space_mm_enabled(va_space))
1971         return false;
1972 
1973     // We might have systems with both ATS and HMM support. ATS gets priority.
1974     if (g_uvm_global.ats.supported)
1975         return g_uvm_global.ats.enabled;
1976 
1977     return uvm_hmm_is_enabled(va_space);
1978 }
1979 
1980 NV_STATUS uvm_test_get_pageable_mem_access_type(UVM_TEST_GET_PAGEABLE_MEM_ACCESS_TYPE_PARAMS *params,
1981                                                  struct file *filp)
1982 {
1983     uvm_va_space_t *va_space = uvm_va_space_get(filp);
1984 
1985     params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_NONE;
1986 
1987     if (uvm_va_space_pageable_mem_access_supported(va_space)) {
1988         if (g_uvm_global.ats.enabled) {
1989             if (UVM_ATS_IBM_SUPPORTED_IN_KERNEL())
1990                 params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_ATS_KERNEL;
1991             else
1992                 params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_ATS_DRIVER;
1993         }
1994         else {
1995             params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_HMM;
1996         }
1997     }
1998     else if (uvm_va_space_mm_enabled(va_space)) {
1999         params->type = UVM_TEST_PAGEABLE_MEM_ACCESS_TYPE_MMU_NOTIFIER;
2000     }
2001 
2002     return NV_OK;
2003 }
2004 
2005 NV_STATUS uvm_test_flush_deferred_work(UVM_TEST_FLUSH_DEFERRED_WORK_PARAMS *params, struct file *filp)
2006 {
2007     UvmTestDeferredWorkType work_type = params->work_type;
2008 
2009     switch (work_type) {
2010         case UvmTestDeferredWorkTypeAcessedByMappings:
2011             nv_kthread_q_flush(&g_uvm_global.global_q);
2012             return NV_OK;
2013         default:
2014             return NV_ERR_INVALID_ARGUMENT;
2015     }
2016 }
2017 
2018 NV_STATUS uvm_test_enable_nvlink_peer_access(UVM_TEST_ENABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp)
2019 {
2020     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2021     NV_STATUS status = NV_OK;
2022     uvm_gpu_t *gpu0 = NULL;
2023     uvm_gpu_t *gpu1 = NULL;
2024     size_t table_index;
2025     uvm_gpu_peer_t *peer_caps = NULL;
2026 
2027     uvm_va_space_down_write(va_space);
2028 
2029     gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
2030     gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);
2031 
2032     if (gpu0 && gpu1 && !uvm_id_equal(gpu0->id, gpu1->id))
2033         peer_caps = uvm_gpu_peer_caps(gpu0, gpu1);
2034 
2035     if (!peer_caps || peer_caps->link_type < UVM_GPU_LINK_NVLINK_1) {
2036         uvm_va_space_up_write(va_space);
2037         return NV_ERR_INVALID_DEVICE;
2038     }
2039 
2040     table_index = uvm_gpu_peer_table_index(gpu0->id, gpu1->id);
2041 
2042     // NVLink peers are automatically enabled in the VA space at VA space
2043     // registration time. In order to avoid tests having to keep track of the
2044     // different initial state for PCIe and NVLink peers, we just return NV_OK
2045     // if NVLink peers were already enabled.
2046     if (test_bit(table_index, va_space->enabled_peers))
2047         status = NV_OK;
2048     else
2049         status = enable_peers(va_space, gpu0, gpu1);
2050 
2051     uvm_va_space_up_write(va_space);
2052 
2053     return status;
2054 }
2055 
2056 NV_STATUS uvm_test_disable_nvlink_peer_access(UVM_TEST_DISABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp)
2057 {
2058     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2059     NV_STATUS status = NV_OK;
2060     uvm_gpu_t *gpu0, *gpu1;
2061     LIST_HEAD(deferred_free_list);
2062 
2063     uvm_va_space_down_write(va_space);
2064 
2065     gpu0 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidA);
2066     gpu1 = uvm_va_space_get_gpu_by_uuid(va_space, &params->gpuUuidB);
2067 
2068     if (!gpu0 || !gpu1) {
2069         status = NV_ERR_INVALID_DEVICE;
2070         goto error;
2071     }
2072 
2073     if (uvm_id_equal(gpu0->id, gpu1->id)) {
2074         status = NV_ERR_INVALID_DEVICE;
2075         goto error;
2076     }
2077 
2078     if (!uvm_va_space_nvlink_peer_enabled(va_space, gpu0, gpu1)) {
2079         status = NV_ERR_INVALID_DEVICE;
2080         goto error;
2081     }
2082 
2083     disable_peers(va_space, gpu0, gpu1, &deferred_free_list);
2084 
2085     uvm_va_space_up_write(va_space);
2086 
2087     uvm_deferred_free_object_list(&deferred_free_list);
2088 
2089     return NV_OK;
2090 
2091 error:
2092     uvm_va_space_up_write(va_space);
2093     return status;
2094 }
2095 
2096 NV_STATUS uvm_test_va_space_inject_error(UVM_TEST_VA_SPACE_INJECT_ERROR_PARAMS *params, struct file *filp)
2097 {
2098     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2099 
2100     atomic_set(&va_space->test.migrate_vma_allocation_fail_nth, params->migrate_vma_allocation_fail_nth);
2101     atomic_set(&va_space->test.va_block_allocation_fail_nth, params->va_block_allocation_fail_nth);
2102 
2103     return NV_OK;
2104 }
2105 
2106 // Add a fixed number of dummy thread contexts to each thread context table.
2107 // The newly added thread contexts are removed by calling
2108 // uvm_test_va_space_remove_dummy_thread_contexts, or during VA space shutdown.
2109 NV_STATUS uvm_test_va_space_add_dummy_thread_contexts(UVM_TEST_VA_SPACE_ADD_DUMMY_THREAD_CONTEXTS_PARAMS *params,
2110                                                        struct file *filp)
2111 {
2112     size_t i;
2113     uvm_va_space_t *va_space;
2114     size_t total_dummy_thread_contexts = params->num_dummy_thread_contexts * UVM_THREAD_CONTEXT_TABLE_SIZE;
2115     NV_STATUS status = NV_OK;
2116 
2117     if (params->num_dummy_thread_contexts == 0)
2118         return NV_OK;
2119 
2120     va_space = uvm_va_space_get(filp);
2121 
2122     uvm_va_space_down_write(va_space);
2123 
2124     if (va_space->test.dummy_thread_context_wrappers != NULL) {
2125         status = NV_ERR_INVALID_STATE;
2126         goto out;
2127     }
2128 
2129     if (va_space->test.num_dummy_thread_context_wrappers > 0) {
2130         status = NV_ERR_INVALID_STATE;
2131         goto out;
2132     }
2133 
2134     if (!uvm_thread_context_wrapper_is_used()) {
2135         status = NV_ERR_INVALID_STATE;
2136         goto out;
2137     }
2138 
2139     va_space->test.dummy_thread_context_wrappers = uvm_kvmalloc(sizeof(*va_space->test.dummy_thread_context_wrappers) *
2140                                                                 total_dummy_thread_contexts);
2141     if (va_space->test.dummy_thread_context_wrappers == NULL) {
2142         status = NV_ERR_NO_MEMORY;
2143         goto out;
2144     }
2145 
2146     va_space->test.num_dummy_thread_context_wrappers = total_dummy_thread_contexts;
2147 
2148     for (i = 0; i < total_dummy_thread_contexts; i++) {
2149         uvm_thread_context_t *thread_context = &va_space->test.dummy_thread_context_wrappers[i].context;
2150 
2151         // Use the thread context pointer itself as the (fake) task pointer.
2152         thread_context->task = (struct task_struct *) thread_context;
2153 
2154         uvm_thread_context_add_at(thread_context, i % UVM_THREAD_CONTEXT_TABLE_SIZE);
2155     }
2156 
2157 out:
2158     uvm_va_space_up_write(va_space);
2159 
2160     return status;
2161 }
2162 
2163 static void va_space_remove_dummy_thread_contexts(uvm_va_space_t *va_space)
2164 {
2165     size_t i;
2166 
2167     uvm_assert_rwsem_locked_write(&va_space->lock);
2168 
2169     if (va_space->test.dummy_thread_context_wrappers == NULL) {
2170         UVM_ASSERT(va_space->test.num_dummy_thread_context_wrappers == 0);
2171         return;
2172     }
2173 
2174     UVM_ASSERT(uvm_thread_context_wrapper_is_used());
2175     UVM_ASSERT(uvm_enable_builtin_tests != 0);
2176     UVM_ASSERT(va_space->test.num_dummy_thread_context_wrappers > 0);
2177 
2178     for (i = 0; i < va_space->test.num_dummy_thread_context_wrappers; i++) {
2179         uvm_thread_context_t *thread_context = &va_space->test.dummy_thread_context_wrappers[i].context;
2180 
2181         uvm_thread_context_remove_at(thread_context, i % UVM_THREAD_CONTEXT_TABLE_SIZE);
2182     }
2183 
2184     uvm_kvfree(va_space->test.dummy_thread_context_wrappers);
2185     va_space->test.dummy_thread_context_wrappers = NULL;
2186     va_space->test.num_dummy_thread_context_wrappers = 0;
2187 }
2188 
2189 NV_STATUS uvm_test_va_space_remove_dummy_thread_contexts(UVM_TEST_VA_SPACE_REMOVE_DUMMY_THREAD_CONTEXTS_PARAMS *params,
2190                                                           struct file *filp)
2191 {
2192     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2193 
2194     uvm_va_space_down_write(va_space);
2195 
2196     va_space_remove_dummy_thread_contexts(va_space);
2197 
2198     uvm_va_space_up_write(va_space);
2199 
2200     return NV_OK;
2201 }
2202 
2203 NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY_PARAMS *params, struct file *filp)
2204 {
2205     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2206 
2207     // va_space lock is not needed here.
2208     atomic64_set(&va_space->test.destroy_gpu_va_space_delay_us, params->delay_us);
2209 
2210     return NV_OK;
2211 }
2212 
2213 NV_STATUS uvm_test_force_cpu_to_cpu_copy_with_ce(UVM_TEST_FORCE_CPU_TO_CPU_COPY_WITH_CE_PARAMS *params,
2214                                                  struct file *filp)
2216 {
2217     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2218 
2219     va_space->test.force_cpu_to_cpu_copy_with_ce = params->force_copy_with_ce;
2220     return NV_OK;
2221 }
2222 
2223 NV_STATUS uvm_test_va_space_allow_movable_allocations(UVM_TEST_VA_SPACE_ALLOW_MOVABLE_ALLOCATIONS_PARAMS *params,
2224                                                       struct file *filp)
2225 {
2226     uvm_va_space_t *va_space = uvm_va_space_get(filp);
2227 
2228     va_space->test.allow_allocation_from_movable = params->allow_movable;
2229     return NV_OK;
2230 }
2231 
2232 // List of fault service contexts for CPU faults
2233 static LIST_HEAD(g_cpu_service_block_context_list);
2234 
2235 static uvm_spinlock_t g_cpu_service_block_context_list_lock;
2236 
2237 uvm_service_block_context_t *uvm_service_block_context_alloc(struct mm_struct *mm)
2238 {
2239     uvm_service_block_context_t *service_context = uvm_kvmalloc(sizeof(*service_context));
2240 
2241     if (!service_context)
2242         return NULL;
2243 
2244     service_context->block_context = uvm_va_block_context_alloc(mm);
2245     if (!service_context->block_context) {
2246         uvm_kvfree(service_context);
2247         service_context = NULL;
2248     }
2249 
2250     return service_context;
2251 }
2252 
2253 void uvm_service_block_context_free(uvm_service_block_context_t *service_context)
2254 {
2255     if (!service_context)
2256         return;
2257 
2258     uvm_va_block_context_free(service_context->block_context);
2259     uvm_kvfree(service_context);
2260 }
2261 
2262 NV_STATUS uvm_service_block_context_init(void)
2263 {
2264     unsigned num_preallocated_contexts = 4;
2265 
2266     uvm_spin_lock_init(&g_cpu_service_block_context_list_lock, UVM_LOCK_ORDER_LEAF);
2267 
2268     // Pre-allocate some fault service contexts for the CPU and add them to the global list
2269     while (num_preallocated_contexts-- > 0) {
2270         uvm_service_block_context_t *service_context = uvm_service_block_context_alloc(NULL);
2271 
2272         if (!service_context)
2273             return NV_ERR_NO_MEMORY;
2274 
2275         list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
2276     }
2277 
2278     return NV_OK;
2279 }
2280 
2281 void uvm_service_block_context_exit(void)
2282 {
2283     uvm_service_block_context_t *service_context, *service_context_tmp;
2284 
2285     // Free the fault service contexts for the CPU and clear the global list.
2286     list_for_each_entry_safe(service_context,
2287                              service_context_tmp,
2288                              &g_cpu_service_block_context_list,
2289                              cpu_fault.service_context_list) {
2290         uvm_service_block_context_free(service_context);
2291     }
2292 
2293     INIT_LIST_HEAD(&g_cpu_service_block_context_list);
2294 }
2295 
2296 // Get a fault service context from the global list or allocate a new one if
2297 // there are no available entries.
2298 static uvm_service_block_context_t *service_block_context_cpu_alloc(void)
2299 {
2300     uvm_service_block_context_t *service_context;
2301 
2302     uvm_spin_lock(&g_cpu_service_block_context_list_lock);
2303 
2304     service_context = list_first_entry_or_null(&g_cpu_service_block_context_list,
2305                                                uvm_service_block_context_t,
2306                                                cpu_fault.service_context_list);
2307 
2308     if (service_context)
2309         list_del(&service_context->cpu_fault.service_context_list);
2310 
2311     uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
2312 
2313     if (!service_context)
2314         service_context = uvm_service_block_context_alloc(NULL);
2315     else
2316         uvm_va_block_context_init(service_context->block_context, NULL);
2317 
2318     return service_context;
2319 }
2320 
2321 // Put a fault service context in the global list.
2322 static void service_block_context_cpu_free(uvm_service_block_context_t *service_context)
2323 {
2324     uvm_spin_lock(&g_cpu_service_block_context_list_lock);
2325 
2326     list_add(&service_context->cpu_fault.service_context_list, &g_cpu_service_block_context_list);
2327 
2328     uvm_spin_unlock(&g_cpu_service_block_context_list_lock);
2329 }
2330 
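// Common CPU fault handler for both managed and HMM VA ranges. After taking
// the power management lock, it grabs a pre-allocated service context,
// services the fault under the VA space lock (retrying with a throttling nap
// on NV_WARN_MORE_PROCESSING_REQUIRED), checks the involved GPUs for ECC
// errors, and converts the final NV_STATUS into a vm_fault_t for the kernel.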
2331 static vm_fault_t uvm_va_space_cpu_fault(uvm_va_space_t *va_space,
2332                                          struct vm_area_struct *vma,
2333                                          struct vm_fault *vmf,
2334                                          bool is_hmm)
2335 {
2336     uvm_va_block_t *va_block;
2337     NvU64 fault_addr = nv_page_fault_va(vmf);
2338     bool is_write = vmf->flags & FAULT_FLAG_WRITE;
2339     NV_STATUS status = uvm_global_get_status();
2340     bool tools_enabled;
2341     bool major_fault = false;
2342     bool is_remote_mm = false;
2343     uvm_service_block_context_t *service_context;
2344     uvm_processor_mask_t *gpus_to_check_for_ecc;
2345 
2346     if (status != NV_OK)
2347         goto convert_error;
2348 
2349     // TODO: Bug 2583279: Lock tracking is disabled for the power management
2350     // lock in order to suppress reporting of a lock policy violation.
2351     // The violation consists in acquiring the power management lock multiple
2352     // times, and it is manifested as an error during release. The
2353     // re-acquisition of the power management locks happens upon re-entry in the
2354     // UVM module, and it is benign by itself, but when combined with certain
2355     // power management scenarios, it is indicative of a potential deadlock.
2356     // Tracking will be re-enabled once the power management locking strategy is
2357     // modified to avoid deadlocks.
2358     if (!uvm_down_read_trylock_no_tracking(&g_uvm_global.pm.lock)) {
2359         status = NV_ERR_BUSY_RETRY;
2360         goto convert_error;
2361     }
2362 
2363     service_context = service_block_context_cpu_alloc();
2364     if (!service_context) {
2365         status = NV_ERR_NO_MEMORY;
2366         goto unlock;
2367     }
2368 
2369     service_context->cpu_fault.wakeup_time_stamp = 0;
2370 
2371     // There are up to three mm_structs to worry about, and they might all be
2372     // different:
2373     //
2374     // 1) vma->vm_mm
2375     // 2) current->mm
2376     // 3) va_space->va_space_mm.mm (though note that if this is valid, then it
2377     //    must match vma->vm_mm).
2378     //
2379     // The kernel guarantees that vma->vm_mm has a reference taken with
2380     // mmap_lock held on the CPU fault path, so tell the fault handler to use
2381     // that one. current->mm might differ if we're on the access_process_vm
2382     // (ptrace) path or if another driver is calling get_user_pages.
2383     service_context->block_context->mm = vma->vm_mm;
2384 
2385     // The mmap_lock might be held in write mode, but the mode doesn't matter
2386     // for the purpose of lock ordering and we don't rely on it being in write
2387     // anywhere so just record it as read mode in all cases.
2388     uvm_record_lock_mmap_lock_read(vma->vm_mm);
2389 
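    // Service the fault, retrying while the servicing code reports
    // NV_WARN_MORE_PROCESSING_REQUIRED (throttling). Each retry drops the VA
    // space lock and may sleep until the wakeup time stamp set by the previous
    // attempt.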
2390     do {
2391         bool do_sleep = false;
2392 
2393         if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
2394             NvU64 now = NV_GETTIME();
2395             if (now < service_context->cpu_fault.wakeup_time_stamp)
2396                 do_sleep = true;
2397 
2398             if (do_sleep)
2399                 uvm_tools_record_throttling_start(va_space, fault_addr, UVM_ID_CPU);
2400 
2401             // Drop the VA space lock while we sleep
2402             uvm_va_space_up_read(va_space);
2403 
2404             // usleep_range is preferred because msleep has a 20ms granularity
2405             // and udelay uses a busy-wait loop. usleep_range uses
2406             // high-resolution timers and, by adding a range, the Linux
2407             // scheduler may coalesce our wakeup with others, thus saving some
2408             // interrupts.
2409             if (do_sleep) {
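                // The timestamps are in ns; convert the remaining wait to the
                // microseconds usleep_range() expects.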
2410                 unsigned long nap_us = (service_context->cpu_fault.wakeup_time_stamp - now) / 1000;
2411 
2412                 usleep_range(nap_us, nap_us + nap_us / 2);
2413             }
2414         }
2415 
2416         uvm_va_space_down_read(va_space);
2417 
2418         if (do_sleep)
2419             uvm_tools_record_throttling_end(va_space, fault_addr, UVM_ID_CPU);
2420 
2421         if (is_hmm) {
2422             if (va_space->va_space_mm.mm == vma->vm_mm) {
2423                 // Note that normally we should find a va_block for the faulting
2424                 // address because the block had to be created when migrating a
2425                 // page to the GPU and a device private PTE inserted into the CPU
2426                 // page tables in order for migrate_to_ram() to be called. Not
2427                 // finding it means the PTE was remapped to a different virtual
2428                 // address with mremap() so create a new va_block if needed.
2429                 status = uvm_hmm_va_block_find_create(va_space,
2430                                                       fault_addr,
2431                                                       &service_context->block_context->hmm.vma,
2432                                                       &va_block);
2433                 if (status != NV_OK)
2434                     break;
2435 
2436                 UVM_ASSERT(service_context->block_context->hmm.vma == vma);
2437                 status = uvm_hmm_migrate_begin(va_block);
2438                 if (status != NV_OK)
2439                     break;
2440 
2441                 service_context->cpu_fault.vmf = vmf;
2442             }
2443             else {
2444                 is_remote_mm = true;
2445                 status = uvm_hmm_remote_cpu_fault(vmf);
2446                 break;
2447             }
2448         }
2449         else {
2450             status = uvm_va_block_find_create_managed(va_space, fault_addr, &va_block);
2451             if (status != NV_OK) {
2452                 UVM_ASSERT_MSG(status == NV_ERR_NO_MEMORY, "status: %s\n", nvstatusToString(status));
2453                 break;
2454             }
2455 
2456             // Watch out, current->mm might not be vma->vm_mm
2457             UVM_ASSERT(vma == uvm_va_range_vma(va_block->va_range));
2458         }
2459 
2460         // Loop until thrashing goes away.
2461         status = uvm_va_block_cpu_fault(va_block, fault_addr, is_write, service_context);
2462 
2463         if (is_hmm)
2464             uvm_hmm_migrate_finish(va_block);
2465     } while (status == NV_WARN_MORE_PROCESSING_REQUIRED);
2466 
2467     if (status != NV_OK && !(is_hmm && status == NV_ERR_BUSY_RETRY)) {
2468         UvmEventFatalReason reason;
2469 
2470         reason = uvm_tools_status_to_fatal_fault_reason(status);
2471         UVM_ASSERT(reason != UvmEventFatalReasonInvalid);
2472 
2473         uvm_tools_record_cpu_fatal_fault(va_space, fault_addr, is_write, reason);
2474     }
2475 
2476     tools_enabled = va_space->tools.enabled;
2477     gpus_to_check_for_ecc = &service_context->cpu_fault.gpus_to_check_for_ecc;
2478 
2479     if (status == NV_OK && !is_remote_mm)
2480         uvm_global_gpu_retain(gpus_to_check_for_ecc);
2481 
2482     uvm_va_space_up_read(va_space);
2483     uvm_record_unlock_mmap_lock_read(vma->vm_mm);
2484 
2485     if (status == NV_OK && !is_remote_mm) {
2486         status = uvm_global_gpu_check_ecc_error(gpus_to_check_for_ecc);
2487         uvm_global_gpu_release(gpus_to_check_for_ecc);
2488     }
2489 
2490     if (tools_enabled)
2491         uvm_tools_flush_events();
2492 
2493     // Major faults involve I/O in order to resolve the fault.
2494     // If any pages were DMA'ed between the GPU and host memory, that makes it
2495     // a major fault. A process can also get statistics for major and minor
2496     // faults by calling readproc().
2497     major_fault = service_context->cpu_fault.did_migrate;
2498     service_block_context_cpu_free(service_context);
2499 
2500 unlock:
2501     // TODO: Bug 2583279: See the comment above the matching lock acquisition
2502     uvm_up_read_no_tracking(&g_uvm_global.pm.lock);
2503 
2504 convert_error:
2505     switch (status) {
2506         case NV_OK:
2507         case NV_ERR_BUSY_RETRY:
2508             return VM_FAULT_NOPAGE | (major_fault ? VM_FAULT_MAJOR : 0);
2509         case NV_ERR_NO_MEMORY:
2510             return VM_FAULT_OOM;
2511         default:
2512             return VM_FAULT_SIGBUS;
2513     }
2514 }
2515 
2516 vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
2517                                           struct vm_area_struct *vma,
2518                                           struct vm_fault *vmf)
2519 {
2520     UVM_ASSERT(va_space == uvm_va_space_get(vma->vm_file));
2521 
2522     return uvm_va_space_cpu_fault(va_space, vma, vmf, false);
2523 }
2524 
2525 vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
2526                                       struct vm_area_struct *vma,
2527                                       struct vm_fault *vmf)
2528 {
2529     return uvm_va_space_cpu_fault(va_space, vma, vmf, true);
2530 }
2531