1 /*******************************************************************************
2     Copyright (c) 2015-2022 NVIDIA Corporation
4     Permission is hereby granted, free of charge, to any person obtaining a copy
5     of this software and associated documentation files (the "Software"), to
6     deal in the Software without restriction, including without limitation the
7     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
8     sell copies of the Software, and to permit persons to whom the Software is
9     furnished to do so, subject to the following conditions:
11         The above copyright notice and this permission notice shall be
12         included in all copies or substantial portions of the Software.
22 *******************************************************************************/
24 #ifndef __UVM_VA_RANGE_H__
25 #define __UVM_VA_RANGE_H__
27 #include "uvm_linux.h"
28 #include "nv-kref.h"
29 #include "uvm_common.h"
30 #include "uvm_perf_module.h"
31 #include "uvm_processors.h"
32 #include "uvm_gpu.h"
33 #include "uvm_lock.h"
34 #include "uvm_va_space.h"
35 #include "uvm_range_tree.h"
36 #include "uvm_va_policy.h"
37 #include "uvm_test_ioctl.h"
38 #include "uvm_range_group.h"
39 #include "uvm_forward_decl.h"
40 #include "uvm_mmu.h"
41 #include "uvm_hal_types.h"
42 #include "uvm_mem.h"
43 #include "uvm_tracker.h"
44 #include "uvm_ioctl.h"
46 // VA Ranges are the UVM driver equivalent of Linux kernel vmas. They represent
47 // user allocations of any page-aligned size. We maintain these as a separate
48 // data structure from the vma tree for several reasons:
49 //
50 // 1) RM allocations mapped to the GPU by UVM don't have associated UVM vmas
51 //
52 // 2) We don't always have a separate reference on the vma's mm_struct, so we
53 //    can't always lock mmap_lock on paths where current->mm != vma->vm_mm.
54 //
55 // 3) HMM vmas aren't ours, so we can't use their vm_private_data pointers.
56 //
57 // The tree as a whole is protected by va_space->lock. Faults and mappings only
58 // need to take the lock in read mode.
59 // Modification of the range state (such as changes to logical permissions or
60 // location preferences) must take the lock in write mode.
61 //
62 // VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED:
63 //     Each va_range is contained completely within a parent vma. There can be
64 //     multiple va_ranges under the same vma, but not vice versa. All VAs within
65 //     the va_range share the same policy state.
66 //
67 //     Each va_range is a collection of VA blocks. The VA blocks each have
68 //     individual locks, and they hold the current mapping and location state
69 //     for their block across all processors (CPU and all GPUs).
70 //
71 // VA ranges with type == UVM_VA_RANGE_TYPE_EXTERNAL:
72 //     These ranges track physical allocations made by RM. The UVM driver is
73 //     responsible for mapping them to the GPU(s), but not to the CPU. These
74 //     ranges do not support faulting nor migration, and they do not necessarily
75 //     correspond to valid vmas.
76 //
77 //     These ranges do not have blocks. All state (page tables, mapping handles,
78 //     etc) is maintained within the range.
79 //
80 // VA ranges with type == UVM_VA_RANGE_TYPE_CHANNEL:
81 //     These are similar to EXTERNAL ranges, except they represent internal
82 //     allocations required for user channels to operate (context save areas,
83 //     for example).
84 //
85 // VA ranges with type == UVM_VA_RANGE_TYPE_SKED_REFLECTED:
86 //     These ranges track special SKED reflected mappings required for CNP. The
87 //     mappings don't have any physical backing. They just use PTEs with a
88 //     special kind, see make_sked_reflected_pte_pascal() for an example of the
89 //     PTE encoding.
90 //     Notably the API that creates these ranges calls them "dynamic parallelism
91 //     regions", but we use "SKED reflected ranges" internally as it's more
92 //     descriptive.
93 //
94 // VA ranges with type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
95 //     These ranges track semaphore pool allocations. They are backed by sysmem,
96 //     and persistently mapped on the CPU and all GPUs (with registered VA
97 //     spaces) in a user VA space. The ranges are also mapped on UVM internal VA
98 //     space on the CPU and all registered GPUs.
99 //
100 //     These ranges do not have blocks.
101 //
103 // This enum must be kept in sync with UVM_TEST_VA_RANGE_TYPE in
104 // uvm_test_ioctl.h
// NOTE(review): no enumerators are visible between the braces in this view.
// The comments in this header refer to UVM_VA_RANGE_TYPE_MANAGED, _EXTERNAL,
// _CHANNEL, _SKED_REFLECTED and _SEMAPHORE_POOL; confirm the full list and its
// ordering against UVM_TEST_VA_RANGE_TYPE in uvm_test_ioctl.h.
105 typedef enum
106 {
114 } uvm_va_range_type_t;
116 // Wrapper pairing a vma pointer with a lock, used to protect access to the
// vma's vm_page_prot.
117 typedef struct
118 {
119     // Needed for creating CPU mappings on the va_range. Do not access this
120     // directly; use uvm_va_range_vma and friends instead.
121     struct vm_area_struct *vma;
    // NOTE(review): presumably the lock serializing the vm_page_prot accesses
    // mentioned above. Which paths take it in read versus write mode is not
    // visible in this header - confirm against the users of this struct.
123     uvm_rw_semaphore_t lock;
124 } uvm_vma_wrapper_t;
126 // TODO: Bug 1733295. VA range types should really be inverted. Instead of
127 //       maintaining common node state with a union of structs, we should have
128 //       separate C types for each VA range type. Each type would embed a common
129 //       VA range node.
130 //
131 //       There's a lot of state in the top-level uvm_va_range_t struct below
132 //       which really belongs in the per-type structs (for example, blocks).
133 //       We're deferring that cleanup to the full refactor.
135 // va_range state when va_range.type == UVM_VA_RANGE_TYPE_MANAGED
136 typedef struct
137 {
138     // This is NULL in the case of a zombie allocation. Zombie allocations are
139     // created from unfreed allocations at termination of a process which used
140     // UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE, when at least one other
141     // process is sharing the UVM file descriptor.
142     uvm_vma_wrapper_t *vma_wrapper;
144     // Managed allocations only use this policy and never use the policy
145     // stored in the va_block for HMM allocations.
146     uvm_va_policy_t policy;
    // One data descriptor per perf module type (the array is sized by
    // UVM_PERF_MODULE_TYPE_COUNT).
148     uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
149 } uvm_va_range_managed_t;
// Reference-counted handle to an RM physical allocation. A uvm_ext_gpu_map_t
// points at one of these through its mem_handle field; when an external range
// is split, both resulting ranges keep using the same handle.
151 typedef struct
152 {
153     // GPU mapping the allocation. The GPU's RM address space is required when
154     // releasing the handle.
155     uvm_gpu_t *gpu;
157     // RM handle to the physical allocation. This handle is dup'd into our client
158     // once - on initial mapping of the external allocation. If the allocation is
159     // ever split, its ref_count is incremented. The allocation is not released
160     // until the ref_count drops to 0.
161     NvHandle rm_handle;
163     // Refcount for this handle/allocation. The refcount is used when external
164     // ranges are split, resulting in two ranges using the same physical allocation.
165     nv_kref_t ref_count;
166 } uvm_ext_gpu_mem_handle;
// One mapping on a single GPU inside an External VA range: either a mapping of
// a physical allocation or a sparse mapping without backing. Instances are
// stored in the per-GPU uvm_ext_gpu_range_tree_t of the owning range.
168 typedef struct
169 {
    // Storage in the per-GPU range tree. uvm_ext_gpu_map_container() converts a
    // pointer to this node back into the enclosing uvm_ext_gpu_map_t.
170     uvm_range_tree_node_t node;
172     // Handle to the physical user allocation dup'd into our client. This
173     // prevents the allocation from being removed until we free it, even if the
174     // user frees their handle without telling us.
175     // This will be NULL for sparse mappings, which don't correspond to actual
176     // allocations.
177     uvm_ext_gpu_mem_handle *mem_handle;
179     // Tracks completion of PTE writes on pt_range_vec. The tree lock
180     // protecting this ext_gpu_map may be dropped before those writes are
181     // complete, so subsequent operations on this ext_gpu_map must acquire this
182     // tracker before operating on pt_range_vec.
183     uvm_tracker_t tracker;
185     // GPU on which this allocation is mapped.
186     uvm_gpu_t *gpu;
188     // GPU which owns the allocation. For sysmem, this is the GPU that the
189     // sysmem was originally allocated under. For the allocation to remain valid
190     // we need to prevent the GPU from going away, similarly to P2P mapped
191     // memory.
192     // The same applies to EGM memory.
193     //
194     // This field is not used for sparse mappings as they don't have an
195     // allocation and, hence, owning GPU.
196     //
197     // TODO: Bug 1811006: The semantics of sysmem might change depending on the
198     // resolution of this bug.
199     //
200     // TODO: Bug 1757136: For SLI, this is any GPU in the SLI group. We may need
201     //       to handle that specially.
202     uvm_gpu_t *owning_gpu;
204     // We need to know whether this memory is actually located on owning_gpu so
205     // we know what type of membar is needed at TLB invalidate time, and to know
206     // if the mapping GPU has to be unmapped on UvmDisablePeerAccess.
207     //
208     // This field is not used for sparse mappings as they don't have physical
209     // backing.
210     bool is_sysmem;
212     // EGM memory. If true, is_sysmem also has to be true and owning_gpu
213     // has to be valid.
214     bool is_egm;
215     // GPU page tables mapping the allocation
216     uvm_page_table_range_vec_t pt_range_vec;
218     // Node for the deferred free list where this allocation is stored when it
219     // is unmapped.
220     //
221     // This field is unused for sparse mappings. Since they don't have physical
222     // backing there is no RM object to be freed when the mapping is unmapped.
223     uvm_deferred_free_object_t deferred_free;
224 } uvm_ext_gpu_map_t;
// A range tree of uvm_ext_gpu_map_t instances together with the mutex that
// protects it. uvm_va_range_external_t holds one of these per GPU.
226 typedef struct
227 {
228     // Lock protecting the range tree.
229     uvm_mutex_t lock;
231     // Range tree that contains all of the mapped portions of an External VA
232     // range. The tree holds uvm_ext_gpu_map_t instances.
233     uvm_range_tree_t tree;
234 } uvm_ext_gpu_range_tree_t;
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_EXTERNAL
236 typedef struct
237 {
238     // Mask of GPUs which have mappings to this VA range. If a bit in this mask
239     // is set, the corresponding pointer in gpu_ranges is valid.
240     // The bitmap can be safely accessed by following the locking rules:
241     //   * If the VA space lock is held for write, the mask can be read or written
242     //     normally.
243     //   * If the VA space lock is held for read, and one of the range tree locks is
244     //     held, only the bit corresponding to that GPU range tree can be accessed.
245     //     Writes must use uvm_processor_mask_set_atomic and
246     //     uvm_processor_mask_clear_atomic to avoid clobbering other bits in the
247     //     mask. If no range tree lock is held, the mask cannot be accessed.
248     //   * If the VA space lock is not held, the mask cannot be accessed
249     uvm_processor_mask_t mapped_gpus;
251     // Per-GPU tree of mapped external allocations. This has to be per-GPU in the VA
252     // range because each GPU is able to map a completely different set of
253     // allocations to the same VA range.
254     uvm_ext_gpu_range_tree_t gpu_ranges[UVM_ID_MAX_GPUS];
256     // Dynamically allocated processor mask (note the uvm_processor_mask_t type),
257     // allocated in uvm_va_range_create_external() and used and freed in uvm_free().
258     uvm_processor_mask_t *retained_mask;
259 } uvm_va_range_external_t;
261 // va_range state when va_range.type == UVM_VA_RANGE_TYPE_CHANNEL. This
262 // represents a channel buffer resource and mapping.
263 typedef struct
264 {
265     // Only a single GPU can map a channel resource, so we only need one GPU
266     // VA space parent.
267     uvm_gpu_va_space_t *gpu_va_space;
269     // Page tables mapped by this range
270     uvm_page_table_range_vec_t pt_range_vec;
272     // Physical location of this channel resource. All pages have the same
273     // aperture.
274     uvm_aperture_t aperture;
276     // Note that this is not a normal RM object handle. It is a non-zero opaque
277     // identifier underneath the GPU VA space which represents this channel
278     // resource. Each channel using this VA range has retained this descriptor
279     // and is responsible for releasing it. That's safe because channels outlive
280     // their VA ranges.
281     NvP64 rm_descriptor;
283     // This is an ID assigned by RM to each resource descriptor.
284     NvU32 rm_id;
286     // The TSG which owns this mapping. Sharing of VA ranges is only allowed
287     // within the same TSG. If valid == false, no sharing is allowed because the
288     // channel is not in a TSG.
289     struct
290     {
291         bool valid;
292         NvU32 id;
293     } tsg;
    // NOTE(review): presumably the number of channels currently sharing this VA
    // range (see the TSG sharing rule above) - confirm against the code which
    // updates this field.
295     NvU64 ref_count;
297     // Storage in the corresponding uvm_gpu_va_space's channel_va_ranges list
298     struct list_head list_node;
299 } uvm_va_range_channel_t;
301 // va_range state when va_range.type == UVM_VA_RANGE_TYPE_SKED_REFLECTED. This
302 // represents a SKED reflected mapping, which has no physical backing.
303 typedef struct
304 {
305     // Each SKED reflected range is unique to a single GPU so only a single GPU
306     // VA space needs to be tracked.
307     uvm_gpu_va_space_t *gpu_va_space;
309     // Page tables mapped by this range
310     uvm_page_table_range_vec_t pt_range_vec;
311 } uvm_va_range_sked_reflected_t;
// va_range state when va_range.type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL
313 typedef struct
314 {
    // Allocation backing the semaphore pool.
315     uvm_mem_t *mem;
317     // The optional owner is a GPU (at most one) that has the allocation cached -
318     // in this case, all writes must be done from this GPU.
319     // Protected by the va_space lock.
320     uvm_gpu_t *owner;
322     // Per-GPU mapping attributes
323     uvm_mem_gpu_mapping_attrs_t gpu_attrs[UVM_ID_MAX_GPUS];
325     // Default attributes to assign when a new GPU is registered
326     uvm_mem_gpu_mapping_attrs_t default_gpu_attrs;
328     // Tracks all outstanding GPU work using this allocation.
329     uvm_tracker_t tracker;
    // NOTE(review): presumably protects tracker above - confirm against the
    // code which adds to and waits on the tracker.
330     uvm_mutex_t tracker_lock;
331 } uvm_va_range_semaphore_pool_t;
// A single VA range: common node state followed by per-type state. The
// different range types are described in the comment at the top of this file.
333 struct uvm_va_range_struct
334 {
335     // Parent uvm_va_space.
336     uvm_va_space_t *va_space;
338     // Storage in VA range tree. Also contains range start and end.
339     // start and end + 1 have to be PAGE_SIZE aligned.
340     uvm_range_tree_node_t node;
342     // Force the next split on this range to fail. Set by error injection ioctl
343     // (testing purposes only).
344     bool inject_split_error;
346     // Force the next register_gpu_va_space to fail while adding this va_range.
347     // Set by error injection ioctl (testing purposes only).
348     bool inject_add_gpu_va_space_error;
350     // Mask of UVM-Lite GPUs for the VA range
351     //
352     // If the preferred location is set to a non-faultable GPU or the CPU,
353     // this mask contains all non-faultable GPUs that are in the accessed by
354     // mask and the preferred location itself if it's a GPU. Empty otherwise.
355     //
356     // All UVM-Lite GPUs have mappings only to the preferred location. The
357     // mappings are initially established only when the pages are resident on
358     // the preferred location, but persist after that until the preferred
359     // location is changed or a GPU stops being a UVM-Lite GPU.
360     uvm_processor_mask_t uvm_lite_gpus;
362     // This is a uvm_va_block_t ** array of all VA block pointers under this
363     // range. The pointers can be accessed using the functions
364     // uvm_va_range_block() and uvm_va_range_block_create(). The latter
365     // allocates the block if it doesn't already exist. Once allocated, the
366     // blocks persist in the array until the parent VA range is destroyed.
367     //
368     // Concurrent on-demand allocation requires the use of either atomics or a
369     // spin lock. Given that we don't want to take a spin lock for every lookup,
370     // and that the blocks are persistent, atomics are preferred.
371     //
372     // The number of blocks is calculated from the range size using
373     // uvm_va_range_num_blocks().
374     //
375     // TODO: Bug 1766585: Compare perf of up-front allocation and demand-
376     //       allocation of blocks in the common case (lots of accessed blocks)
377     //       and the sparse case. If the common case is hurt by demand-
378     //       allocation, or if the sparse case isn't helped much, just allocate
379     //       them all at range allocation.
380     atomic_long_t *blocks;
    // Discriminator for the anonymous union below: only the member matching
    // this type holds valid state.
382     uvm_va_range_type_t type;
383     union
384     {
385         uvm_va_range_managed_t managed;
386         uvm_va_range_external_t external;
387         uvm_va_range_channel_t channel;
388         uvm_va_range_sked_reflected_t sked_reflected;
389         uvm_va_range_semaphore_pool_t semaphore_pool;
390     };
391 };
393 // Module load/exit
394 NV_STATUS uvm_va_range_init(void);
395 void uvm_va_range_exit(void);
uvm_va_range_size(uvm_va_range_t * va_range)397 static NvU64 uvm_va_range_size(uvm_va_range_t *va_range)
398 {
399     return uvm_range_tree_node_size(&va_range->node);
400 }
uvm_va_range_is_aligned(uvm_va_range_t * va_range,NvU64 alignment)402 static bool uvm_va_range_is_aligned(uvm_va_range_t *va_range, NvU64 alignment)
403 {
404     return IS_ALIGNED(va_range->node.start, alignment) && IS_ALIGNED(uvm_va_range_size(va_range), alignment);
405 }
uvm_va_range_is_managed_zombie(uvm_va_range_t * va_range)407 static bool uvm_va_range_is_managed_zombie(uvm_va_range_t *va_range)
408 {
409     return va_range->type == UVM_VA_RANGE_TYPE_MANAGED && va_range->managed.vma_wrapper == NULL;
410 }
412 // Create a va_range with type UVM_VA_RANGE_TYPE_MANAGED. The out va_range pointer
413 // is optional.
414 //
415 // Returns NV_ERR_UVM_ADDRESS_IN_USE if the vma overlaps with an existing range
416 // in the va_space tree.
417 NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
418                                    struct mm_struct *mm,
419                                    uvm_vma_wrapper_t *vma_wrapper,
420                                    uvm_va_range_t **out_va_range);
422 // Create a va_range with type UVM_VA_RANGE_TYPE_EXTERNAL. The out va_range
423 // pointer is optional.
424 //
425 // Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
426 // range in the va_space tree.
427 NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space,
428                                        struct mm_struct *mm,
429                                        NvU64 start,
430                                        NvU64 length,
431                                        uvm_va_range_t **out_va_range);
433 // Create a va_range with type UVM_VA_RANGE_TYPE_CHANNEL. The out va_range
434 // pointer is optional.
435 //
436 // Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
437 // range in the va_space tree.
438 NV_STATUS uvm_va_range_create_channel(uvm_va_space_t *va_space,
439                                       struct mm_struct *mm,
440                                       NvU64 start,
441                                       NvU64 end,
442                                       uvm_va_range_t **out_va_range);
// Create a va_range for the span of length bytes starting at start.
//
// NOTE(review): going by the name, the created range has type
// UVM_VA_RANGE_TYPE_SKED_REFLECTED, and presumably follows the conventions
// documented for uvm_va_range_create_external() (optional out_va_range,
// NV_ERR_UVM_ADDRESS_IN_USE on overlap) - confirm against the implementation.
444 NV_STATUS uvm_va_range_create_sked_reflected(uvm_va_space_t *va_space,
445                                              struct mm_struct *mm,
446                                              NvU64 start,
447                                              NvU64 length,
448                                              uvm_va_range_t **out_va_range);
// Create a va_range for the span of length bytes starting at start.
// per_gpu_attrs points to per_gpu_attrs_count GPU mapping attribute entries.
//
// NOTE(review): going by the name, the created range has type
// UVM_VA_RANGE_TYPE_SEMAPHORE_POOL. Whether per_gpu_attrs may be NULL when
// per_gpu_attrs_count is 0, and whether the overlap/out_va_range conventions
// match uvm_va_range_create_external(), is not visible here - confirm.
450 NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
451                                              struct mm_struct *mm,
452                                              NvU64 start,
453                                              NvU64 length,
454                                              const UvmGpuMappingAttributes *per_gpu_attrs,
455                                              NvU32 per_gpu_attrs_count,
456                                              uvm_va_range_t **out_va_range);
458 // Destroys any state associated with this VA range, removes the VA range from
459 // the VA space, and frees the VA range.
460 //
461 // deferred_free_list may be NULL if the VA range type is known to not require
462 // deferred free. Otherwise this function adds entries to the list for later
463 // processing by uvm_deferred_free_object_list.
464 void uvm_va_range_destroy(uvm_va_range_t *va_range, struct list_head *deferred_free_list);
// NOTE(review): presumably turns a managed va_range into a zombie, which
// uvm_va_range_is_managed_zombie() identifies as a managed range whose
// managed.vma_wrapper is NULL - confirm against the implementation.
466 void uvm_va_range_zombify(uvm_va_range_t *va_range);
// Entry points taking a parameter struct and the calling file.
// NOTE(review): presumably invoked from the UVM ioctl dispatch for the
// correspondingly named requests - confirm.
468 NV_STATUS uvm_api_clean_up_zombie_resources(UVM_CLEAN_UP_ZOMBIE_RESOURCES_PARAMS *params, struct file *filp);
469 NV_STATUS uvm_api_validate_va_range(UVM_VALIDATE_VA_RANGE_PARAMS *params, struct file *filp);
471 // Inform the VA range that a GPU VA space is now available for them to map, if
472 // the VA range is supposed to proactively map GPUs (UvmAllocSemaphorePool,
473 // UvmSetAccessedBy).
474 //
475 // If mm != NULL, that mm is used for any CPU mappings which may be created as
476 // a result of this call. See uvm_va_block_context_t::mm for details.
477 //
478 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
479 //          mode.
480 NV_STATUS uvm_va_range_add_gpu_va_space(uvm_va_range_t *va_range,
481                                         uvm_gpu_va_space_t *gpu_va_space,
482                                         struct mm_struct *mm);
484 // Destroy the VA range's mappings on the GPU, if it has any
485 //
486 // If mm != NULL, that mm is used for any CPU mappings which may be created as
487 // a result of this call. See uvm_va_block_context_t::mm for details.
488 //
489 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
490 //          mode.
491 void uvm_va_range_remove_gpu_va_space(uvm_va_range_t *va_range,
492                                       uvm_gpu_va_space_t *gpu_va_space,
493                                       struct mm_struct *mm,
494                                       struct list_head *deferred_free_list);
496 // Inform the VA range that peer mappings can now be established between the
497 // GPUs, if the VA range is supposed to proactively create them (UvmSetAccessedBy).
498 NV_STATUS uvm_va_range_enable_peer(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);
500 // Unmap all page tables in this VA range which have peer mappings between these
501 // two GPUs, in either direction.
502 void uvm_va_range_disable_peer(uvm_va_range_t *va_range,
503                                uvm_gpu_t *gpu0,
504                                uvm_gpu_t *gpu1,
505                                struct list_head *deferred_free_list);
507 // Notify the VA range of a newly registered GPU.
508 //
509 // LOCKING: the lock of the enclosing VA space is held in R/W mode
510 NV_STATUS uvm_va_range_register_gpu(uvm_va_range_t *va_range, uvm_gpu_t *gpu);
512 // Unmap all page tables in this VA range which map memory owned by this GPU.
513 // Managed ranges will have any memory still resident on this GPU evicted to
514 // system memory.
515 //
516 // deferred_free_list may be NULL if the VA range type is known to not require
517 // deferred free. Otherwise this function adds entries to the list for later
518 // processing by uvm_deferred_free_object_list.
519 //
520 // If mm != NULL, that mm is used for any CPU mappings which may be created as
521 // a result of this call. See uvm_va_block_context_t::mm for details.
522 //
523 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
524 //          mode.
525 void uvm_va_range_unregister_gpu(uvm_va_range_t *va_range,
526                                  uvm_gpu_t *gpu,
527                                  struct mm_struct *mm,
528                                  struct list_head *deferred_free_list);
530 // Splits existing_va_range into two pieces, with new_va_range always after
531 // existing. existing is updated to have new_end. new_end+1 must be page-
532 // aligned.
533 //
534 // Before: [----------- existing ------------]
535 // After:  [---- existing ----][---- new ----]
536 //                            ^new_end
537 //
538 // On error, existing_va_range is still accessible and is left in its original
539 // functional state.
540 //
541 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
542 NV_STATUS uvm_va_range_split(uvm_va_range_t *existing_va_range,
543                              NvU64 new_end,
544                              uvm_va_range_t **new_va_range);
546 // TODO: Bug 1707562: Merge va ranges
uvm_va_range_container(uvm_range_tree_node_t * node)548 static uvm_va_range_t *uvm_va_range_container(uvm_range_tree_node_t *node)
549 {
550     if (!node)
551         return NULL;
552     return container_of(node, uvm_va_range_t, node);
553 }
555 // Returns the va_range containing addr, if any
556 uvm_va_range_t *uvm_va_range_find(uvm_va_space_t *va_space, NvU64 addr);
uvm_ext_gpu_map_container(uvm_range_tree_node_t * node)558 static uvm_ext_gpu_map_t *uvm_ext_gpu_map_container(uvm_range_tree_node_t *node)
559 {
560     if (!node)
561         return NULL;
562     return container_of(node, uvm_ext_gpu_map_t, node);
563 }
565 // Iterators for all va_ranges
// Visits every va_range linked into va_space->va_range_tree.head through its
// node.list member. va_range is the loop cursor.
567 #define uvm_for_each_va_range(va_range, va_space) \
568     list_for_each_entry((va_range), &(va_space)->va_range_tree.head, node.list)
// Same traversal built on list_for_each_entry_safe: va_range_next is a second
// cursor holding the following entry, which is what allows the current
// va_range to be removed from the list inside the loop body.
570 #define uvm_for_each_va_range_safe(va_range, va_range_next, va_space) \
571     list_for_each_entry_safe((va_range), (va_range_next), &(va_space)->va_range_tree.head, node.list)
574 // Iterators for specific ranges
576 // Returns the first va_range in the range [start, end], if any
577 uvm_va_range_t *uvm_va_space_iter_first(uvm_va_space_t *va_space, NvU64 start, NvU64 end);
579 // Returns the va_range following the provided va_range in address order, if
580 // that va_range's start <= the provided end.
581 uvm_va_range_t *uvm_va_space_iter_next(uvm_va_range_t *va_range, NvU64 end);
583 // Like uvm_va_space_iter_next, but also returns NULL if the next va_range
584 // is not adjacent to the provided va_range.
// Returns the va_range which immediately follows va_range, provided that
// uvm_va_space_iter_next() finds one for the given end and that it starts
// exactly one byte past va_range's end. Returns NULL otherwise.
static uvm_va_range_t *uvm_va_space_iter_next_contig(uvm_va_range_t *va_range, NvU64 end)
{
    uvm_va_range_t *candidate = uvm_va_space_iter_next(va_range, end);

    if (!candidate)
        return NULL;

    // A hole between the two ranges ends the contiguous run.
    return candidate->node.start == va_range->node.end + 1 ? candidate : NULL;
}
593 // Returns whether the range [start, end] has any VA ranges within it
uvm_va_space_range_empty(uvm_va_space_t * va_space,NvU64 start,NvU64 end)594 static bool uvm_va_space_range_empty(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
595 {
596     return uvm_va_space_iter_first(va_space, start, end) == NULL;
597 }
// Visits the va_ranges of va_space found in [start, end]: begins at
// uvm_va_space_iter_first() and advances with uvm_va_space_iter_next() until
// that returns NULL. The end argument is expanded on every step, so it should
// be free of side effects.
599 #define uvm_for_each_va_range_in(va_range, va_space, start, end)            \
600     for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end));  \
601          (va_range);                                                        \
602          (va_range) = uvm_va_space_iter_next((va_range), (end)))
// Variant which fetches the following range into va_range_next before the loop
// body runs, so the body does not need va_range to stay in the tree in order
// to advance. Note that uvm_va_space_iter_next() is invoked with a NULL
// va_range here, both when the initial lookup finds nothing and on the final
// step, so it is expected to tolerate a NULL argument.
604 #define uvm_for_each_va_range_in_safe(va_range, va_range_next, va_space, start, end)    \
605     for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end)),              \
606              (va_range_next) = uvm_va_space_iter_next((va_range), (end));               \
607          (va_range);                                                                    \
608          (va_range) = (va_range_next), (va_range_next) = uvm_va_space_iter_next((va_range), (end)))
610 // Iterator for all contiguous VA ranges between [start, end]. If any part of
611 // [start, end] is not covered by a VA range, iteration stops.
// The initial lookup uses [start, start], so the loop body only runs at all if
// some VA range covers start itself.
612 #define uvm_for_each_va_range_in_contig(va_range, va_space, start, end)         \
613     for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
614          (va_range);                                                            \
615          (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))
// Like uvm_for_each_va_range_in_contig, but starts from the caller-provided
// first_va_range (which may be NULL, giving zero iterations) instead of
// performing a lookup.
617 #define uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) \
618     for ((va_range) = (first_va_range);                                               \
619          (va_range);                                                                  \
620          (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))
622 // Like uvm_for_each_va_range_in_contig but also stops iteration if any VA range
623 // has a type other than UVM_VA_RANGE_TYPE_MANAGED.
624 #define uvm_for_each_managed_va_range_in_contig(va_range, va_space, start, end) \
625     for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
626          (va_range) && (va_range)->type == UVM_VA_RANGE_TYPE_MANAGED;           \
627          (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))
// Visits every va_range found within the addresses spanned by vma, that is
// [vma->vm_start, vma->vm_end - 1], in the VA space obtained from vma->vm_file
// through uvm_va_space_get().
//
// The vma argument is parenthesized at each use so that any pointer expression
// (for example "base + 1" or "&local_vma") can be passed. It is expanded
// several times, so it should be free of side effects.
#define uvm_for_each_va_range_in_vma(va_range, vma)               \
    uvm_for_each_va_range_in(va_range,                            \
                             uvm_va_space_get((vma)->vm_file),    \
                             (vma)->vm_start,                     \
                             (vma)->vm_end - 1)

// Variant of uvm_for_each_va_range_in_vma built on
// uvm_for_each_va_range_in_safe: va_range_next receives the following range
// before the loop body runs.
#define uvm_for_each_va_range_in_vma_safe(va_range, va_range_next, vma)   \
    uvm_for_each_va_range_in_safe(va_range,                               \
                                  va_range_next,                          \
                                  uvm_va_space_get((vma)->vm_file),       \
                                  (vma)->vm_start,                        \
                                  (vma)->vm_end - 1)
642 // Only call this if you're sure that either:
643 // 1) You have a reference on the vma's vm_mm and that vma->vm_mm's mmap_lock is
644 //    held; or
645 // 2) You won't be operating on the vma (as with vm_insert_page) or accessing
646 //    any fields in the vma that can change without va_space->lock being held
647 //    (such as vm_flags).
648 //
649 // Otherwise, use uvm_va_range_vma_current or uvm_va_range_vma_check and be
650 // prepared to handle a NULL return value.
// Returns the vma stored in the managed range's vma_wrapper after checking, via
// assertions, that the vma and the va_range still agree with each other. This
// function has no NULL return path: callers must not pass a zombie range.
uvm_va_range_vma(uvm_va_range_t * va_range)651 static struct vm_area_struct *uvm_va_range_vma(uvm_va_range_t *va_range)
652 {
653     struct vm_area_struct *vma;
    // Only managed ranges carry a vma_wrapper, and zombies (NULL vma_wrapper)
    // are not allowed here; uvm_va_range_vma_check() filters them out before
    // calling this function.
654     UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d", va_range->type);
655     UVM_ASSERT(va_range->managed.vma_wrapper);
    // The caller must hold the va_space lock.
657     uvm_assert_rwsem_locked(&va_range->va_space->lock);
659     // vm_file, vm_private_data, vm_start, and vm_end are all safe to access
660     // here because they can't change without the kernel calling vm_ops->open
661     // or vm_ops->close, which both take va_space->lock.
662     vma = va_range->managed.vma_wrapper->vma;
663     UVM_ASSERT(vma);
    // The vma must point back at the same wrapper through vm_private_data.
664     UVM_ASSERT_MSG(vma->vm_private_data == va_range->managed.vma_wrapper,
665                    "vma: 0x%llx [0x%lx, 0x%lx] has vm_private_data 0x%llx\n",
666                    (NvU64)vma,
667                    vma->vm_start,
668                    vma->vm_end - 1,
669                    (NvU64)vma->vm_private_data);
    // The vma's file must belong to the VA space which owns this range.
670     UVM_ASSERT_MSG(va_range->va_space == uvm_va_space_get(vma->vm_file),
671                    "va_range va_space: 0x%llx vm_file: 0x%llx vm_file va_space: 0x%llx",
672                    (NvU64)va_range->va_space,
673                    (NvU64)vma->vm_file,
674                    (NvU64)uvm_va_space_get(vma->vm_file));
    // The range must lie entirely inside the vma. vm_end is exclusive while
    // node.end is inclusive, hence the "- 1" in the comparisons below.
675     UVM_ASSERT_MSG(va_range->node.start >= vma->vm_start,
676                    "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
677                    va_range->node.start,
678                    va_range->node.end,
679                    vma->vm_start,
680                    vma->vm_end - 1);
681     UVM_ASSERT_MSG(va_range->node.end <= vma->vm_end - 1,
682                    "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
683                    va_range->node.start,
684                    va_range->node.end,
685                    vma->vm_start,
686                    vma->vm_end - 1);
688     return vma;
689 }
691 // Check that the VA range's vma is safe to use under mm. If not, NULL is
692 // returned. If the vma is returned, there must be a reference on mm and
693 // mm->mmap_lock must be held.
// Returns the managed range's vma only when it is safe to operate on it under
// mm: the range must not be a zombie and the vma must belong to mm. In every
// other case NULL is returned. When a vma is returned, mm->mmap_lock is
// asserted to be held.
static struct vm_area_struct *uvm_va_range_vma_check(uvm_va_range_t *va_range, struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d\n", va_range->type);

    // A zombie range has lost its vma_wrapper, so there is no vma to return.
    if (va_range->managed.vma_wrapper == NULL)
        return NULL;

    vma = uvm_va_range_vma(va_range);

    // The mm a caller can safely use depends on the path it is on:
    //  - CPU fault         vma->vm_mm
    //  - GPU fault         current->mm or va_space->va_space_mm.mm
    //  - IOCTL             current->mm or va_space->va_space_mm.mm
    //  - Process teardown  NULL
    //
    // Because of that, the caller may hold no reference on the mm_struct which
    // owns the vma, and this cannot be known before looking at the vma. Taking
    // mmap_lock at that point is not an option: mmap_lock is ordered above the
    // va_space lock, and the va_space lock must already be held to query the
    // va_range. So the cases in which operating on the vma is safe have to be
    // detected here instead.
    //
    // If mm cannot be shown to be safe, the vma must not be operated on at
    // all. The vma cannot be freed while the va_space lock is held, so the
    // pointer stays valid, but fields such as vm_start and vm_end may change
    // underneath us. vm_insert_page is also forbidden unless the vma's
    // mmap_lock is held.
    //
    // If uvm_va_space_mm_enabled() is true then vma->vm_mm is guaranteed to be
    // va_space->va_space_mm.mm, since that is enforced at mmap.
    //
    // vma->vm_mm can differ from current->mm, for example because of fork,
    // ptrace or process teardown. That is also the situation in the GPU fault
    // handler.
    if (vma->vm_mm == mm) {
        uvm_assert_mmap_lock_locked(vma->vm_mm);
        return vma;
    }

    return NULL;
}
738 // Helper for use when the only mm which is known is current->mm
uvm_va_range_vma_current(uvm_va_range_t * va_range)739 static struct vm_area_struct *uvm_va_range_vma_current(uvm_va_range_t *va_range)
740 {
741     return uvm_va_range_vma_check(va_range, current->mm);
742 }
// Returns the maximum number of VA blocks which could be contained within the
745 // given va_range (number of elements in the va_range->blocks array).
746 // va_range->node.start and .end must be set.
747 //
748 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
749 size_t uvm_va_range_num_blocks(uvm_va_range_t *va_range);
751 // Get the index within the va_range->blocks array of the VA block
752 // corresponding to addr. The block pointer is not guaranteed to be valid. Use
753 // either uvm_va_range_block or uvm_va_range_block_create to look up the block.
754 //
755 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
756 size_t uvm_va_range_block_index(uvm_va_range_t *va_range, NvU64 addr);
758 // Looks up the VA block at va_range->blocks[index]. If no block is present at
759 // that index, NULL is returned.
760 //
761 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
uvm_va_range_block(uvm_va_range_t * va_range,size_t index)762 static uvm_va_block_t *uvm_va_range_block(uvm_va_range_t *va_range, size_t index)
763 {
764     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
765     UVM_ASSERT(index < uvm_va_range_num_blocks(va_range));
766     uvm_assert_rwsem_locked(&va_range->va_space->lock);
768     return (uvm_va_block_t *)atomic_long_read(&va_range->blocks[index]);
769 }
771 // Same as uvm_va_range_block except that the block is created if not already
772 // present in the array. If NV_OK is returned, the block has been allocated
773 // successfully.
774 //
775 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
776 NV_STATUS uvm_va_range_block_create(uvm_va_range_t *va_range, size_t index, uvm_va_block_t **out_block);
778 // Returns the first populated VA block in the VA range after the input
779 // va_block, or NULL if none. If the input va_block is NULL, this returns the
780 // first VA block in the VA range, if any exists.
781 uvm_va_block_t *uvm_va_range_block_next(uvm_va_range_t *va_range, uvm_va_block_t *va_block);
// Iterate over the populated VA blocks in __va_range, in the order produced by
// uvm_va_range_block_next(). Does not create new VA blocks.
//
// __va_block is the iteration cursor. It is NULL once the loop has run to
// completion.
#define for_each_va_block_in_va_range(__va_range, __va_block)               \
    for ((__va_block) = uvm_va_range_block_next((__va_range), NULL);        \
         (__va_block);                                                      \
         (__va_block) = uvm_va_range_block_next((__va_range), (__va_block)))
// Iterate over the populated VA blocks in __va_range. Does not create new VA
// blocks.
//
// This is the "safe" variant of for_each_va_block_in_va_range: the block that
// follows __va_block is looked up and stored in __va_block_next before the
// loop body runs, so advancing the loop does not pass __va_block back to
// uvm_va_range_block_next() after the body has executed.
#define for_each_va_block_in_va_range_safe(__va_range, __va_block, __va_block_next)     \
    for ((__va_block) = uvm_va_range_block_next((__va_range), NULL),                    \
         (__va_block_next) = uvm_va_range_block_next((__va_range), (__va_block));       \
         (__va_block);                                                                  \
         (__va_block) = (__va_block_next),                                              \
         (__va_block_next) = (__va_block) ?                                             \
             uvm_va_range_block_next((__va_range), (__va_block)) : NULL)
// Set the VA range preferred location (or unset it if preferred location is
// UVM_ID_INVALID).
//
800 // Unsetting the preferred location potentially changes the range group
801 // association to UVM_RANGE_GROUP_ID_NONE if the VA range was previously
802 // associated with a non-migratable range group.
803 //
804 // Changing the preferred location also updates the mask and mappings of GPUs
805 // in UVM-Lite mode.
806 //
807 // The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
808 //
809 // If mm != NULL, that mm is used for any CPU mappings which may be created as
810 // a result of this call. See uvm_va_block_context_t::mm for details.
811 //
812 // If out_tracker != NULL any block work will be added to that tracker.
813 //
814 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
815 //          mode.
816 NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range,
817                                               uvm_processor_id_t preferred_location,
818                                               int preferred_cpu_nid,
819                                               struct mm_struct *mm,
820                                               uvm_tracker_t *out_tracker);
822 // Add a processor to the accessed_by mask and establish any new required
823 // mappings.
824 //
825 // Also update the mask of UVM-Lite GPUs if needed.
826 //
827 // If mm != NULL, that mm is used for any CPU mappings which may be created as
828 // a result of this call. See uvm_va_block_context_t::mm for details.
829 //
830 // If out_tracker != NULL any block work will be added to that tracker.
831 //
832 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
833 //          mode.
834 NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range,
835                                        uvm_processor_id_t processor_id,
836                                        struct mm_struct *mm,
837                                        uvm_tracker_t *out_tracker);
839 // Remove a processor from the accessed_by mask
840 //
841 // If out_tracker != NULL any block work will be added to that tracker.
842 //
843 // This also updates the mask and mappings of the UVM-Lite GPUs if required.
844 void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
845                                     uvm_processor_id_t processor_id,
846                                     uvm_tracker_t *out_tracker);
848 // Set read-duplication and remove any existing accessed_by and remote mappings
849 //
850 // If mm != NULL, that mm is used for any CPU mappings which may be created as
851 // a result of this call. See uvm_va_block_context_t::mm for details.
852 //
853 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
854 //          mode.
855 NV_STATUS uvm_va_range_set_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);
857 // Unset read-duplication and establish accessed_by mappings
858 //
859 // If mm != NULL, that mm is used for any CPU mappings which may be created as
860 // a result of this call. See uvm_va_block_context_t::mm for details.
861 //
862 // LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
863 //          mode.
864 NV_STATUS uvm_va_range_unset_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);
866 // Create and destroy vma wrappers
867 uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(struct vm_area_struct *vma);
868 void uvm_vma_wrapper_destroy(uvm_vma_wrapper_t *vma_wrapper);
uvm_va_range_get_policy(uvm_va_range_t * va_range)870 static uvm_va_policy_t *uvm_va_range_get_policy(uvm_va_range_t *va_range)
871 {
872     UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
873     return &va_range->managed.policy;
874 }
876 NV_STATUS uvm_test_va_range_info(UVM_TEST_VA_RANGE_INFO_PARAMS *params, struct file *filp);
877 NV_STATUS uvm_test_va_range_split(UVM_TEST_VA_RANGE_SPLIT_PARAMS *params, struct file *filp);
878 NV_STATUS uvm_test_va_range_inject_split_error(UVM_TEST_VA_RANGE_INJECT_SPLIT_ERROR_PARAMS *params, struct file *filp);
879 NV_STATUS uvm_test_va_range_inject_add_gpu_va_space_error(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR_PARAMS *params,
880                                                           struct file *filp);
882 #endif // __UVM_VA_RANGE_H__