/*******************************************************************************
    Copyright (c) 2015-2023 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_VA_SPACE_H__
#define __UVM_VA_SPACE_H__

#include "uvm_processors.h"
#include "uvm_global.h"
#include "uvm_gpu.h"
#include "uvm_range_tree.h"
#include "uvm_range_group.h"
#include "uvm_forward_decl.h"
#include "uvm_mmu.h"
#include "uvm_linux.h"
#include "uvm_common.h"
#include "nv-kref.h"
#include "nv-linux.h"
#include "uvm_perf_events.h"
#include "uvm_perf_module.h"
#include "uvm_va_block_types.h"
#include "uvm_va_block.h"
#include "uvm_hmm.h"
#include "uvm_test_ioctl.h"
#include "uvm_ats.h"
#include "uvm_va_space_mm.h"
#include "uvm_conf_computing.h"

// uvm_deferred_free_object provides a mechanism for building and later freeing
// a list of objects which are owned by a VA space, but can't be freed while the
// VA space lock is held.

typedef enum
{
    UVM_DEFERRED_FREE_OBJECT_TYPE_CHANNEL,
    UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE,
    UVM_DEFERRED_FREE_OBJECT_TYPE_EXTERNAL_ALLOCATION,
    UVM_DEFERRED_FREE_OBJECT_TYPE_COUNT
} uvm_deferred_free_object_type_t;

typedef struct
{
    uvm_deferred_free_object_type_t type;
    struct list_head list_node;
} uvm_deferred_free_object_t;

static void uvm_deferred_free_object_add(struct list_head *list,
                                         uvm_deferred_free_object_t *object,
                                         uvm_deferred_free_object_type_t type)
{
    object->type = type;
    list_add_tail(&object->list_node, list);
}

// Walks the list of pending objects and frees each one as appropriate to its
// type.
//
// LOCKING: May take the GPU isr_lock and the RM locks.
void uvm_deferred_free_object_list(struct list_head *deferred_free_list);
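
// Illustrative sketch (not part of the API) of the intended pattern: queue
// objects for deferred freeing while the VA space lock is held, then free them
// after dropping the lock. The surrounding code and the gpu_va_space variable
// are hypothetical; deferred_free is the member defined below in
// uvm_gpu_va_space_struct.
//
//     LIST_HEAD(deferred_free_list);
//
//     uvm_va_space_down_write(va_space);
//     // ... unregister the object from the VA space ...
//     uvm_deferred_free_object_add(&deferred_free_list,
//                                  &gpu_va_space->deferred_free,
//                                  UVM_DEFERRED_FREE_OBJECT_GPU_VA_SPACE);
//     uvm_va_space_up_write(va_space);
//
//     // Freeing may take the GPU isr_lock and RM locks, so it happens outside
//     // the VA space lock.
//     uvm_deferred_free_object_list(&deferred_free_list);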

typedef enum
{
    // The GPU VA space has been initialized but not yet inserted into the
    // parent VA space.
    UVM_GPU_VA_SPACE_STATE_INIT = 0,

    // The GPU VA space is active in the VA space.
    UVM_GPU_VA_SPACE_STATE_ACTIVE,

    // The GPU VA space is no longer active in the VA space. This state can be
    // observed when threads retain the gpu_va_space then drop the VA space
    // lock. After re-taking the VA space lock, the state must be inspected to
    // see if another thread unregistered the gpu_va_space in the meantime.
    UVM_GPU_VA_SPACE_STATE_DEAD,

    UVM_GPU_VA_SPACE_STATE_COUNT
} uvm_gpu_va_space_state_t;

struct uvm_gpu_va_space_struct
{
    // Parent pointers
    uvm_va_space_t *va_space;
    uvm_gpu_t *gpu;

    uvm_gpu_va_space_state_t state;

    // Handle to the duped GPU VA space
    // to be used for all further GPU VA space related UVM-RM interactions.
    uvmGpuAddressSpaceHandle duped_gpu_va_space;
    bool did_set_page_directory;

    uvm_page_tree_t page_tables;

    // List of all uvm_user_channel_t's under this GPU VA space
    struct list_head registered_channels;

    // List of all uvm_va_range_t's under this GPU VA space with type ==
    // UVM_VA_RANGE_TYPE_CHANNEL. Used at channel registration time to find
    // shareable VA ranges without having to iterate through all VA ranges in
    // the VA space.
    struct list_head channel_va_ranges;

    // Boolean which is 1 if no new channel registration is allowed. This is set
    // when all the channels under the GPU VA space have been stopped to prevent
    // new ones from entering after we drop the VA space lock. It is an atomic_t
    // because multiple threads may set it to 1 concurrently.
    atomic_t disallow_new_channels;

    // Node for the deferred free list where this GPU VA space is stored upon
    // being unregistered.
    uvm_deferred_free_object_t deferred_free;

    // Reference count for this gpu_va_space. This only protects the memory
    // object itself, for use in cases when the gpu_va_space needs to be
    // accessed across dropping and re-acquiring the VA space lock.
    nv_kref_t kref;

    // ATS specific state
    uvm_ats_gpu_va_space_t ats;
};

typedef struct
{
    int                  numa_node;

    uvm_processor_mask_t gpus;
} uvm_cpu_gpu_affinity_t;

struct uvm_va_space_struct
{
    // Mask of gpus registered with the va space
    uvm_processor_mask_t registered_gpus;

    // Array of pointers to the uvm_gpu_t objects that correspond to the
    // uvm_processor_id_t index.
    //
    // With SMC, GPUs can be partitioned so the number of uvm_gpu_t objects can
    // be larger than UVM_ID_MAX_GPUS. However, each VA space can only
    // subscribe to a single partition per GPU, so it is fine to have a regular
    // processor mask.
    uvm_gpu_t *registered_gpus_table[UVM_ID_MAX_GPUS];

    // Mask of processors registered with the va space that support replayable
    // faults.
    uvm_processor_mask_t faultable_processors;

    // Mask of processors registered with the va space that don't support
    // faulting.
    uvm_processor_mask_t non_faultable_processors;

    // This is a count of non fault capable processors with a GPU VA space
    // registered.
    NvU32 num_non_faultable_gpu_va_spaces;

    // Semaphore protecting the state of the va space
    uvm_rw_semaphore_t lock;

    // Lock taken prior to taking the VA space lock in write mode, or prior to
    // taking the VA space lock in read mode on a path which will call in RM.
    // See UVM_LOCK_ORDER_VA_SPACE_SERIALIZE_WRITERS in uvm_lock.h.
    uvm_mutex_t serialize_writers_lock;

    // Lock taken to serialize down_reads on the VA space lock with up_writes in
    // other threads. See
    // UVM_LOCK_ORDER_VA_SPACE_READ_ACQUIRE_WRITE_RELEASE_LOCK in uvm_lock.h.
    uvm_mutex_t read_acquire_write_release_lock;

    // Tree of uvm_va_range_t's
    uvm_range_tree_t va_range_tree;
    // Kernel mapping structure passed to unmap_mapping_range() to unmap CPU
    // PTEs in this process.
    struct address_space *mapping;

    // Storage in g_uvm_global.va_spaces.list
    struct list_head list_node;

    // Monotonically increasing counter for range groups IDs
    atomic64_t range_group_id_counter;

    // Range groups
    struct radix_tree_root range_groups;
    uvm_range_tree_t range_group_ranges;

    // Peer to peer table
    // A bitmask of peer to peer pairs enabled in this va_space
    // indexed by a peer_table_index returned by uvm_gpu_peer_table_index().
    DECLARE_BITMAP(enabled_peers, UVM_MAX_UNIQUE_GPU_PAIRS);

    // Temporary copy of the above state used to avoid allocation during VA
    // space destroy.
    DECLARE_BITMAP(enabled_peers_teardown, UVM_MAX_UNIQUE_GPU_PAIRS);

    // Interpreting these processor masks:
    //      uvm_processor_mask_test(foo[A], B)
    // ...should be read as "test if A foo B." For example:
    //      uvm_processor_mask_test(accessible_from[B], A)
    // means "test if B is accessible_from A."

    // Pre-computed masks that contain, for each processor, a mask of processors
    // which that processor can directly access. In other words, this will test
    // whether A has direct access to B:
    //      uvm_processor_mask_test(can_access[A], B)
    uvm_processor_mask_t can_access[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that have direct access enabled to its memory. This is the
    // opposite direction as can_access. In other words, this will test whether
    // A has direct access to B:
    //      uvm_processor_mask_test(accessible_from[B], A)
    uvm_processor_mask_t accessible_from[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that can directly copy to and from its memory. This is
    // almost the same as accessible_from masks, but also requires peer identity
    // mappings to be supported for peer access.
    uvm_processor_mask_t can_copy_from[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor, a mask of processors
    // to which that processor has NVLINK access. In other words, this will test
    // whether A has NVLINK access to B:
    //      uvm_processor_mask_test(has_nvlink[A], B)
    // This is a subset of can_access.
    uvm_processor_mask_t has_nvlink[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that have direct access to its memory and native support
    // for atomics in HW. This is a subset of accessible_from.
    uvm_processor_mask_t has_native_atomics[UVM_ID_MAX_PROCESSORS];

    // Pre-computed masks that contain, for each processor memory, a mask with
    // the processors that are indirect peers. Indirect peers can access each
    // other's memory like regular peers, but with additional latency and/or bw
    // penalty.
    uvm_processor_mask_t indirect_peers[UVM_ID_MAX_PROCESSORS];
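
    // Illustrative example (not from the original source): with the VA space
    // lock held, checking whether processor A can directly access processor
    // B's memory follows the convention above. Indexing by uvm_id_value() is
    // an assumption based on the UVM_ID_MAX_PROCESSORS-sized arrays.
    //
    //     if (uvm_processor_mask_test(&va_space->can_access[uvm_id_value(id_a)], id_b)) {
    //         // A has direct access to B's memory
    //     }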

    // Mask of gpu_va_spaces registered with the va space
    // indexed by gpu->id
    uvm_processor_mask_t registered_gpu_va_spaces;

    // Mask of GPUs which have temporarily dropped the VA space lock mid-
    // unregister. Used to make other paths return an error rather than
    // corrupting state.
    uvm_processor_mask_t gpu_unregister_in_progress;

    // Mask of processors that are participating in system-wide atomics
    uvm_processor_mask_t system_wide_atomics_enabled_processors;

    // Mask of physical GPUs where access counters are enabled on this VA space
    uvm_parent_processor_mask_t access_counters_enabled_processors;

    // Array with information regarding CPU/GPU NUMA affinity. There is one
    // entry per CPU NUMA node. Entries in the array are populated sequentially
    // as new CPU NUMA nodes are discovered on GPU registration. Each entry
    // contains a CPU NUMA node id, and a mask with the GPUs attached to it.
    // Since each GPU can only be attached to one CPU node id, the array can
    // contain information for up to UVM_ID_MAX_GPUS nodes. The information is
    // stored in the VA space to avoid taking the global lock.
    uvm_cpu_gpu_affinity_t gpu_cpu_numa_affinity[UVM_ID_MAX_GPUS];

    // Unregistering a GPU may trigger memory eviction from the GPU to the CPU.
    // This must happen without allocation, thus, a buffer is preallocated
    // at GPU register and freed at GPU unregister.
    uvm_conf_computing_dma_buffer_t *gpu_unregister_dma_buffer[UVM_ID_MAX_GPUS];

    // Array of GPU VA spaces
    uvm_gpu_va_space_t *gpu_va_spaces[UVM_ID_MAX_GPUS];

    // Tracking of GPU VA spaces which have dropped the VA space lock and are
    // pending destruction. uvm_va_space_mm_shutdown has to wait for those
    // destroy operations to be completely done.
    struct
    {
        atomic_t num_pending;
        wait_queue_head_t wait_queue;
    } gpu_va_space_deferred_free;

    // Per-va_space event notification information for performance heuristics
    uvm_perf_va_space_events_t perf_events;

    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];

    // Array of modules that are loaded in the va_space, indexed by module type
    uvm_perf_module_t *perf_modules[UVM_PERF_MODULE_TYPE_COUNT];

    // Lists of counters listening for events on this VA space
    // Protected by lock
    struct
    {
        bool enabled;

        uvm_rw_semaphore_t lock;

        // Lists of counters listening for events on this VA space
        struct list_head counters[UVM_TOTAL_COUNTERS];
        struct list_head queues_v1[UvmEventNumTypesAll];
        struct list_head queues_v2[UvmEventNumTypesAll];

        // Node for this va_space in global subscribers list
        struct list_head node;
    } tools;

    // Boolean which is 1 if all user channels have been already stopped. This
    // is an atomic_t because multiple threads may call
    // uvm_va_space_stop_all_user_channels concurrently.
    atomic_t user_channels_stopped;

    // Prevent future registrations of any kind (GPU, GPU VA space, channel).
    // This is used when the associated va_space_mm is torn down, which has to
    // prevent any new work from being started in this VA space.
    bool disallow_new_registers;

    bool user_channel_stops_are_immediate;

    // Block context used for GPU unmap operations so that allocation is not
    // required on the teardown path. This can only be used while the VA space
    // lock is held in write mode. Access using uvm_va_space_block_context().
    uvm_va_block_context_t *va_block_context;

    NvU64 initialization_flags;

    // The mm currently associated with this VA space, if any.
    uvm_va_space_mm_t va_space_mm;

    union
    {
        uvm_ats_va_space_t ats;

        // HMM information about this VA space.
        uvm_hmm_va_space_t hmm;
    };

    struct
    {
        bool  page_prefetch_enabled;
        bool  skip_migrate_vma;

        atomic_t migrate_vma_allocation_fail_nth;

        atomic_t va_block_allocation_fail_nth;

        uvm_thread_context_wrapper_t *dummy_thread_context_wrappers;
        size_t num_dummy_thread_context_wrappers;

        atomic64_t destroy_gpu_va_space_delay_us;

        atomic64_t split_invalidate_delay_us;

        bool force_cpu_to_cpu_copy_with_ce;

        bool allow_allocation_from_movable;
    } test;

    // Queue item for deferred f_ops->release() handling
    nv_kthread_q_item_t deferred_release_q_item;
};

static uvm_gpu_t *uvm_va_space_get_gpu(uvm_va_space_t *va_space, uvm_gpu_id_t gpu_id)
{
    uvm_gpu_t *gpu;

    UVM_ASSERT(uvm_processor_mask_test(&va_space->registered_gpus, gpu_id));

    gpu = va_space->registered_gpus_table[uvm_id_gpu_index(gpu_id)];

    UVM_ASSERT(gpu);
    UVM_ASSERT(uvm_gpu_get(gpu->id) == gpu);

    return gpu;
}

static const char *uvm_va_space_processor_name(uvm_va_space_t *va_space, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id))
        return "0: CPU";
    else
        return uvm_gpu_name(uvm_va_space_get_gpu(va_space, id));
}

static void uvm_va_space_processor_uuid(uvm_va_space_t *va_space, NvProcessorUuid *uuid, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id)) {
        memcpy(uuid, &NV_PROCESSOR_UUID_CPU_DEFAULT, sizeof(*uuid));
    }
    else {
        uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
        UVM_ASSERT(gpu);
        memcpy(uuid, &gpu->uuid, sizeof(*uuid));
    }
}

static bool uvm_va_space_processor_has_memory(uvm_va_space_t *va_space, uvm_processor_id_t id)
{
    if (UVM_ID_IS_CPU(id))
        return true;

    return uvm_va_space_get_gpu(va_space, id)->mem_info.size > 0;
}

NV_STATUS uvm_va_space_create(struct address_space *mapping, uvm_va_space_t **va_space_ptr, NvU64 flags);
void uvm_va_space_destroy(uvm_va_space_t *va_space);

// All VA space locking should be done with these wrappers. They're macros so
// lock assertions are attributed to line numbers correctly.

#define uvm_va_space_down_write(__va_space)                             \
    do {                                                                \
        uvm_mutex_lock(&(__va_space)->serialize_writers_lock);          \
        uvm_mutex_lock(&(__va_space)->read_acquire_write_release_lock); \
        uvm_down_write(&(__va_space)->lock);                            \
    } while (0)

#define uvm_va_space_up_write(__va_space)                                   \
    do {                                                                    \
        uvm_up_write(&(__va_space)->lock);                                  \
        uvm_mutex_unlock(&(__va_space)->read_acquire_write_release_lock);   \
        uvm_mutex_unlock(&(__va_space)->serialize_writers_lock);            \
    } while (0)

#define uvm_va_space_downgrade_write(__va_space)                                        \
    do {                                                                                \
        uvm_downgrade_write(&(__va_space)->lock);                                       \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
        uvm_mutex_unlock_out_of_order(&(__va_space)->serialize_writers_lock);           \
    } while (0)

// Call this when holding the VA space lock for write in order to downgrade to
// read on a path which also needs to make RM calls.
#define uvm_va_space_downgrade_write_rm(__va_space)                                     \
    do {                                                                                \
        uvm_assert_mutex_locked(&(__va_space)->serialize_writers_lock);                 \
        uvm_downgrade_write(&(__va_space)->lock);                                       \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
    } while (0)

#define uvm_va_space_down_read(__va_space)                                              \
    do {                                                                                \
        uvm_mutex_lock(&(__va_space)->read_acquire_write_release_lock);                 \
        uvm_down_read(&(__va_space)->lock);                                             \
        uvm_mutex_unlock_out_of_order(&(__va_space)->read_acquire_write_release_lock);  \
    } while (0)

// Call this if RM calls need to be made while holding the VA space lock in read
// mode. Note that taking read_acquire_write_release_lock is unnecessary since
// the down_read is serialized with another thread's up_write by the
// serialize_writers_lock.
#define uvm_va_space_down_read_rm(__va_space)                           \
    do {                                                                \
        uvm_mutex_lock(&(__va_space)->serialize_writers_lock);          \
        uvm_down_read(&(__va_space)->lock);                             \
    } while (0)

#define uvm_va_space_up_read(__va_space) uvm_up_read(&(__va_space)->lock)

#define uvm_va_space_up_read_rm(__va_space)                             \
    do {                                                                \
        uvm_up_read(&(__va_space)->lock);                               \
        uvm_mutex_unlock(&(__va_space)->serialize_writers_lock);        \
    } while (0)
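
// Illustrative sketch of how the wrappers above compose (hypothetical
// surrounding code): a writer path that downgrades to read mode before calling
// into RM.
//
//     uvm_va_space_down_write(va_space);
//     // ... modify VA space state ...
//
//     // Keep serialize_writers_lock held across the downgrade so RM calls are
//     // allowed while in read mode.
//     uvm_va_space_downgrade_write_rm(va_space);
//     // ... RM calls with the VA space lock held in read mode ...
//     uvm_va_space_up_read_rm(va_space);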

// Get a registered gpu by uuid. This restricts the search for GPUs to those
// that have been registered with the va_space. This returns NULL if the GPU is
// not present, or not registered with the va_space.
//
// LOCKING: The VA space lock must be held.
uvm_gpu_t *uvm_va_space_get_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Like uvm_va_space_get_gpu_by_uuid, but also returns NULL if the GPU does
// not have a GPU VA space registered in the UVM va_space.
//
// LOCKING: The VA space lock must be held.
uvm_gpu_t *uvm_va_space_get_gpu_by_uuid_with_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Same as uvm_va_space_get_gpu_by_uuid but it also retains the GPU. The caller
// cannot assume that the GPU is still registered in the VA space after the
// function returns.
//
// LOCKING: The function takes and releases the VA space lock in read mode.
uvm_gpu_t *uvm_va_space_retain_gpu_by_uuid(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Returns whether read-duplication is supported.
// If changing_gpu is NULL, returns the current state.
// Otherwise, returns what the result would be once changing_gpu's VA space is
// added or removed (by inverting its current state).
bool uvm_va_space_can_read_duplicate(uvm_va_space_t *va_space, uvm_gpu_t *changing_gpu);

// Register a gpu in the va space.
// Note that each gpu can only be registered once in a va space.
//
// The input gpu_uuid is for the physical GPU. The user_rm_va_space argument
// identifies the SMC partition if provided and SMC is enabled.
//
// This call returns whether the GPU memory is a NUMA node in the kernel and the
// corresponding node id.
// It also returns the GI UUID (if gpu_uuid is an SMC partition) or a copy of
// gpu_uuid if the GPU is not SMC capable or SMC is not enabled.
NV_STATUS uvm_va_space_register_gpu(uvm_va_space_t *va_space,
                                    const NvProcessorUuid *gpu_uuid,
                                    const uvm_rm_user_object_t *user_rm_va_space,
                                    NvBool *numa_enabled,
                                    NvS32 *numa_node_id,
                                    NvProcessorUuid *uuid_out);

// Unregister a gpu from the va space
NV_STATUS uvm_va_space_unregister_gpu(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Registers a GPU VA space with the UVM VA space.
NV_STATUS uvm_va_space_register_gpu_va_space(uvm_va_space_t *va_space,
                                             uvm_rm_user_object_t *user_rm_va_space,
                                             const NvProcessorUuid *gpu_uuid);

// Unregisters a GPU VA space from the UVM VA space.
NV_STATUS uvm_va_space_unregister_gpu_va_space(uvm_va_space_t *va_space, const NvProcessorUuid *gpu_uuid);

// Stop all user channels
//
// This function sets a flag in the VA space indicating that all the channels
// have been already stopped and should only be used when no new user channels
// can be registered.
//
// LOCKING: The VA space lock must be held in read mode, not write.
void uvm_va_space_stop_all_user_channels(uvm_va_space_t *va_space);

// Calls uvm_user_channel_detach on all user channels in a VA space.
//
// The detached channels are added to the input list. The caller is expected to
// drop the VA space lock and call uvm_deferred_free_object_list to complete the
// destroy operation.
//
// LOCKING: The owning VA space must be locked in write mode.
void uvm_va_space_detach_all_user_channels(uvm_va_space_t *va_space, struct list_head *deferred_free_list);

// Returns whether peer access between these two GPUs has been enabled in this
// VA space. Both GPUs must be registered in the VA space.
bool uvm_va_space_peer_enabled(uvm_va_space_t *va_space, const uvm_gpu_t *gpu0, const uvm_gpu_t *gpu1);

// Returns the va_space this file points to. Returns NULL if this file
// does not point to a va_space.
static uvm_va_space_t *uvm_fd_va_space(struct file *filp)
{
    uvm_va_space_t *va_space;
    uvm_fd_type_t type;

    type = uvm_fd_type(filp, (void **) &va_space);
    if (type != UVM_FD_VA_SPACE)
        return NULL;

    return va_space;
}

static uvm_va_space_t *uvm_va_space_get(struct file *filp)
{
    uvm_fd_type_t fd_type;
    uvm_va_space_t *va_space;

    fd_type = uvm_fd_type(filp, (void **)&va_space);
    UVM_ASSERT(uvm_file_is_nvidia_uvm(filp));
    UVM_ASSERT_MSG(fd_type == UVM_FD_VA_SPACE, "filp: 0x%llx", (NvU64)filp);

    return va_space;
}
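
// Illustrative sketch (hypothetical ioctl handler and params type): the VA
// space is recovered from the struct file before taking the VA space lock.
//
//     static NV_STATUS uvm_api_example(UVM_EXAMPLE_PARAMS *params, struct file *filp)
//     {
//         uvm_va_space_t *va_space = uvm_va_space_get(filp);
//
//         uvm_va_space_down_read(va_space);
//         // ... read VA space state ...
//         uvm_va_space_up_read(va_space);
//
//         return NV_OK;
//     }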

static uvm_va_block_context_t *uvm_va_space_block_context(uvm_va_space_t *va_space, struct mm_struct *mm)
{
    uvm_assert_rwsem_locked_write(&va_space->lock);
    if (mm)
        uvm_assert_mmap_lock_locked(mm);

    uvm_va_block_context_init(va_space->va_block_context, mm);
    return va_space->va_block_context;
}
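
// Illustrative sketch (hypothetical surrounding code and block_context
// variable): teardown paths reuse the preallocated context so no allocation is
// needed while the VA space lock is held in write mode.
//
//     uvm_va_space_down_write(va_space);
//     block_context = uvm_va_space_block_context(va_space, NULL);
//     // ... GPU unmap operations using block_context ...
//     uvm_va_space_up_write(va_space);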

// Retains the GPU VA space memory object. destroy_gpu_va_space and
// uvm_gpu_va_space_release drop the count. This is used to keep the GPU VA
// space object allocated when dropping and re-taking the VA space lock. If
// another thread called remove_gpu_va_space in the meantime,
// gpu_va_space->state will be UVM_GPU_VA_SPACE_STATE_DEAD.
static inline void uvm_gpu_va_space_retain(uvm_gpu_va_space_t *gpu_va_space)
{
    nv_kref_get(&gpu_va_space->kref);
}

// This only frees the GPU VA space object itself, so it must have been removed
// from its VA space and destroyed prior to the final release.
void uvm_gpu_va_space_release(uvm_gpu_va_space_t *gpu_va_space);

// Wrapper for nvUvmInterfaceUnsetPageDirectory
void uvm_gpu_va_space_unset_page_dir(uvm_gpu_va_space_t *gpu_va_space);

static uvm_gpu_va_space_state_t uvm_gpu_va_space_state(uvm_gpu_va_space_t *gpu_va_space)
{
    UVM_ASSERT(gpu_va_space->gpu);
    UVM_ASSERT(gpu_va_space->va_space);

    return gpu_va_space->state;
}
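
// Illustrative sketch of the retain pattern described above (hypothetical
// surrounding code): keep the object alive across a lock drop, then recheck
// its state after re-taking the lock.
//
//     uvm_gpu_va_space_retain(gpu_va_space);
//     uvm_va_space_up_write(va_space);
//
//     // ... work that can't be done while holding the VA space lock ...
//
//     uvm_va_space_down_write(va_space);
//     if (uvm_gpu_va_space_state(gpu_va_space) == UVM_GPU_VA_SPACE_STATE_DEAD) {
//         // Another thread unregistered the GPU VA space in the meantime
//     }
//     uvm_va_space_up_write(va_space);
//
//     uvm_gpu_va_space_release(gpu_va_space);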

// Return the GPU VA space for the given physical GPU.
// Locking: the va_space lock must be held.
uvm_gpu_va_space_t *uvm_gpu_va_space_get_by_parent_gpu(uvm_va_space_t *va_space,
                                                       uvm_parent_gpu_t *parent_gpu);

static uvm_gpu_va_space_t *uvm_gpu_va_space_get(uvm_va_space_t *va_space, uvm_gpu_t *gpu)
{
    uvm_gpu_va_space_t *gpu_va_space;

    if (!gpu)
        return NULL;

    gpu_va_space = uvm_gpu_va_space_get_by_parent_gpu(va_space, gpu->parent);
    if (gpu_va_space)
        UVM_ASSERT(gpu_va_space->gpu == gpu);

    return gpu_va_space;
}

#define for_each_gpu_va_space(__gpu_va_space, __va_space)                                                     \
    for (__gpu_va_space =                                                                                     \
            uvm_gpu_va_space_get(                                                                             \
                __va_space,                                                                                   \
                uvm_processor_mask_find_first_va_space_gpu(&__va_space->registered_gpu_va_spaces, __va_space) \
            );                                                                                                \
         __gpu_va_space;                                                                                      \
         __gpu_va_space =                                                                                     \
            uvm_gpu_va_space_get(                                                                             \
                __va_space,                                                                                   \
                __uvm_processor_mask_find_next_va_space_gpu(&__va_space->registered_gpu_va_spaces,            \
                                                            __va_space,                                       \
                                                            __gpu_va_space->gpu)                              \
            )                                                                                                 \
        )

// Return the first GPU set in the given mask or NULL. The caller must ensure
// that the GPUs set in the mask are registered in the VA space and cannot be
// unregistered during this call.
static uvm_gpu_t *uvm_processor_mask_find_first_va_space_gpu(const uvm_processor_mask_t *mask, uvm_va_space_t *va_space)
{
    uvm_gpu_t *gpu;
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(uvm_processor_mask_gpu_subset(mask, &va_space->registered_gpus));

    gpu_id = uvm_processor_mask_find_first_gpu_id(mask);
    if (UVM_ID_IS_INVALID(gpu_id))
        return NULL;

    gpu = uvm_va_space_get_gpu(va_space, gpu_id);
    UVM_ASSERT_MSG(gpu, "gpu_id %u\n", uvm_id_value(gpu_id));

    return gpu;
}

static uvm_gpu_t *uvm_va_space_find_first_gpu(uvm_va_space_t *va_space)
{
    uvm_assert_rwsem_locked(&va_space->lock);

    return uvm_processor_mask_find_first_va_space_gpu(&va_space->registered_gpus, va_space);
}

// Same as uvm_processor_mask_find_next_va_space_gpu below, but gpu cannot be
// NULL
static uvm_gpu_t *__uvm_processor_mask_find_next_va_space_gpu(const uvm_processor_mask_t *mask,
                                                              uvm_va_space_t *va_space,
                                                              uvm_gpu_t *gpu)
{
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(gpu != NULL);
    UVM_ASSERT(uvm_processor_mask_gpu_subset(mask, &va_space->registered_gpus));

    gpu_id = uvm_processor_mask_find_next_id(mask, uvm_gpu_id_next(gpu->id));
    if (UVM_ID_IS_INVALID(gpu_id))
        return NULL;

    gpu = uvm_va_space_get_gpu(va_space, gpu_id);
    UVM_ASSERT_MSG(gpu, "gpu_id %u\n", uvm_id_value(gpu_id));

    return gpu;
}

// Return the next GPU with an id larger than gpu->id set in the given mask.
// The function returns NULL if gpu is NULL. The caller must ensure that the
// GPUs set in the mask are registered in the VA space and cannot be
// unregistered during this call.
static uvm_gpu_t *uvm_processor_mask_find_next_va_space_gpu(const uvm_processor_mask_t *mask,
                                                            uvm_va_space_t *va_space,
                                                            uvm_gpu_t *gpu)
{
    if (gpu == NULL)
        return NULL;

    return __uvm_processor_mask_find_next_va_space_gpu(mask, va_space, gpu);
}

#define for_each_va_space_gpu_in_mask(gpu, va_space, mask)                                       \
    for (({uvm_assert_rwsem_locked(&(va_space)->lock);                                           \
           gpu = uvm_processor_mask_find_first_va_space_gpu(mask, va_space);});                  \
           gpu != NULL;                                                                          \
           gpu = __uvm_processor_mask_find_next_va_space_gpu(mask, va_space, gpu))

// Helper to iterate over all GPUs registered in a UVM VA space
#define for_each_va_space_gpu(gpu, va_space) \
    for_each_va_space_gpu_in_mask(gpu, va_space, &(va_space)->registered_gpus)
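
// Illustrative sketch (hypothetical surrounding code): counting the GPUs
// registered in a VA space while holding the VA space lock.
//
//     uvm_gpu_t *gpu;
//     NvU32 count = 0;
//
//     uvm_va_space_down_read(va_space);
//     for_each_va_space_gpu(gpu, va_space)
//         count++;
//     uvm_va_space_up_read(va_space);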

// Return the processor in the candidates mask that is "closest" to src, or
// UVM_ID_MAX_PROCESSORS if candidates is empty. The order is:
// - src itself
// - Direct NVLINK GPU peers if src is CPU or GPU (1)
// - NVLINK CPU if src is GPU
// - Indirect NVLINK GPU peers if src is GPU
// - PCIe peers if src is GPU (2)
// - CPU if src is GPU
// - Deterministic selection from the pool of candidates
//
// (1) When src is a GPU, NVLINK GPU peers are preferred over the CPU because in
//     NUMA systems the CPU processor may refer to multiple CPU NUMA nodes, and
//     the bandwidth between src and the farthest CPU node can be substantially
//     lower than the bandwidth between src and its peer GPUs.
// (2) TODO: Bug 1764943: Is copying from a PCI peer always better than copying
//     from CPU?
uvm_processor_id_t uvm_processor_mask_find_closest_id(uvm_va_space_t *va_space,
                                                      const uvm_processor_mask_t *candidates,
                                                      uvm_processor_id_t src);

// Iterate over each ID in mask in order of proximity to src. This is
// destructive to mask.
#define for_each_closest_id(id, mask, src, va_space)                    \
    for (id = uvm_processor_mask_find_closest_id(va_space, mask, src);  \
         UVM_ID_IS_VALID(id);                                           \
         uvm_processor_mask_clear(mask, id), id = uvm_processor_mask_find_closest_id(va_space, mask, src))
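
// Illustrative sketch (hypothetical surrounding code and some_mask variable):
// visiting candidates in proximity order. The mask is consumed by the loop, so
// iterate over a copy if the original mask is still needed.
//
//     uvm_processor_mask_t candidates;
//     uvm_processor_id_t id;
//
//     uvm_processor_mask_copy(&candidates, &some_mask);
//     for_each_closest_id(id, &candidates, src_id, va_space) {
//         // id is the next-closest processor to src_id among the candidates
//     }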

// Return the GPU whose memory corresponds to the given node_id
static uvm_gpu_t *uvm_va_space_find_gpu_with_memory_node_id(uvm_va_space_t *va_space, int node_id)
{
    uvm_gpu_t *gpu;

    UVM_ASSERT(nv_numa_node_has_memory(node_id));

    if (!g_uvm_global.ats.supported)
        return NULL;

    for_each_va_space_gpu(gpu, va_space) {
        if (uvm_gpu_numa_node(gpu) == node_id)
            return gpu;
    }

    return NULL;
}

static bool uvm_va_space_memory_node_is_gpu(uvm_va_space_t *va_space, int node_id)
{
    return uvm_va_space_find_gpu_with_memory_node_id(va_space, node_id) != NULL;
}

// Return a processor mask with the GPUs attached to the node_id CPU memory
// node
static void uvm_va_space_get_gpus_attached_to_cpu_node(uvm_va_space_t *va_space,
                                                       int node_id,
                                                       uvm_processor_mask_t *gpus)
{
    uvm_gpu_id_t gpu_id;

    UVM_ASSERT(!uvm_va_space_memory_node_is_gpu(va_space, node_id));

    for_each_gpu_id(gpu_id) {
        const uvm_cpu_gpu_affinity_t *affinity = &va_space->gpu_cpu_numa_affinity[uvm_id_gpu_index(gpu_id)];
        if (affinity->numa_node == node_id) {
            uvm_processor_mask_copy(gpus, &affinity->gpus);
            return;
        }
    }

    uvm_processor_mask_zero(gpus);
}

// Helper that returns the first GPU in the mask returned by
// uvm_va_space_get_gpus_attached_to_cpu_node or NULL if empty
static uvm_gpu_t *uvm_va_space_find_first_gpu_attached_to_cpu_node(uvm_va_space_t *va_space, int node_id)
{
    uvm_processor_mask_t gpus;

    uvm_va_space_get_gpus_attached_to_cpu_node(va_space, node_id, &gpus);

    return uvm_processor_mask_find_first_va_space_gpu(&gpus, va_space);
}

// Obtain the user channel with the given instance_ptr. This is used during
// non-replayable fault service. This function needs to be called with the va
// space lock held in order to prevent channels from being removed.
uvm_user_channel_t *uvm_gpu_va_space_get_user_channel(uvm_gpu_va_space_t *gpu_va_space,
                                                      uvm_gpu_phys_address_t instance_ptr);

// Whether some form of pageable access (ATS, HMM) is supported by the system on
// this VA space. This does NOT check whether GPUs with pageable support are
// present, just whether system + VA space support exists.
bool uvm_va_space_pageable_mem_access_supported(uvm_va_space_t *va_space);

NV_STATUS uvm_test_get_pageable_mem_access_type(UVM_TEST_GET_PAGEABLE_MEM_ACCESS_TYPE_PARAMS *params,
                                                struct file *filp);
NV_STATUS uvm_test_enable_nvlink_peer_access(UVM_TEST_ENABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_disable_nvlink_peer_access(UVM_TEST_DISABLE_NVLINK_PEER_ACCESS_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_destroy_gpu_va_space_delay(UVM_TEST_DESTROY_GPU_VA_SPACE_DELAY_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_force_cpu_to_cpu_copy_with_ce(UVM_TEST_FORCE_CPU_TO_CPU_COPY_WITH_CE_PARAMS *params,
                                                 struct file *filp);
NV_STATUS uvm_test_va_space_allow_movable_allocations(UVM_TEST_VA_SPACE_ALLOW_MOVABLE_ALLOCATIONS_PARAMS *params,
                                                      struct file *filp);

// Handle a CPU fault in the given VA space for a managed allocation,
// performing any operations necessary to establish a coherent CPU mapping
// (migrations, cache invalidates, etc.).
//
// Locking:
//  - vma->vm_mm->mmap_lock must be held in at least read mode. Note that it
//    might not be the same as current->mm->mmap_lock.
// Returns:
// VM_FAULT_NOPAGE: if page was faulted in OK
//     (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
// VM_FAULT_OOM: if system memory wasn't available.
// VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed,
//     for example because it's within a range group which is non-migratable.
vm_fault_t uvm_va_space_cpu_fault_managed(uvm_va_space_t *va_space,
                                          struct vm_area_struct *vma,
                                          struct vm_fault *vmf);

// Handle a CPU fault in the given VA space for an HMM allocation,
// performing any operations necessary to establish a coherent CPU mapping
// (migrations, cache invalidates, etc.).
//
// Locking:
//  - vma->vm_mm->mmap_lock must be held in at least read mode. Note that it
//    might not be the same as current->mm->mmap_lock.
// Returns:
// VM_FAULT_NOPAGE: if page was faulted in OK
//     (possibly or'ed with VM_FAULT_MAJOR if a migration was needed).
// VM_FAULT_OOM: if system memory wasn't available.
// VM_FAULT_SIGBUS: if a CPU mapping to fault_addr cannot be accessed.
vm_fault_t uvm_va_space_cpu_fault_hmm(uvm_va_space_t *va_space,
                                      struct vm_area_struct *vma,
                                      struct vm_fault *vmf);

#endif // __UVM_VA_SPACE_H__