/*******************************************************************************
    Copyright (c) 2015-2022 NVIDIA Corporation

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to
    deal in the Software without restriction, including without limitation the
    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
    sell copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

        The above copyright notice and this permission notice shall be
        included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    DEALINGS IN THE SOFTWARE.

*******************************************************************************/

#ifndef __UVM_VA_RANGE_H__
#define __UVM_VA_RANGE_H__

#include "uvm_linux.h"
#include "nv-kref.h"
#include "uvm_common.h"
#include "uvm_perf_module.h"
#include "uvm_processors.h"
#include "uvm_gpu.h"
#include "uvm_lock.h"
#include "uvm_va_space.h"
#include "uvm_range_tree.h"
#include "uvm_va_policy.h"
#include "uvm_test_ioctl.h"
#include "uvm_range_group.h"
#include "uvm_forward_decl.h"
#include "uvm_mmu.h"
#include "uvm_hal_types.h"
#include "uvm_mem.h"
#include "uvm_tracker.h"
#include "uvm_ioctl.h"

// VA Ranges are the UVM driver equivalent of Linux kernel vmas. They represent
// user allocations of any page-aligned size. We maintain these as a separate
// data structure from the vma tree for several reasons:
//
// 1) RM allocations mapped to the GPU by UVM don't have associated UVM vmas
//
// 2) We don't always have a separate reference on the vma's mm_struct, so we
//    can't always lock mmap_lock on paths where current->mm != vma->vm_mm.
//
// 3) HMM vmas aren't ours, so we can't use their vm_private_data pointers.
//
// The tree as a whole is protected by va_space->lock. Faults and mappings only
// need to take the lock in read mode. Modification of the range state (such as
// changes to logical permissions or location preferences) must take the lock
// in write mode.
//
// VA ranges with type == UVM_VA_RANGE_TYPE_MANAGED:
//     Each va_range is contained completely within a parent vma. There can be
//     multiple va_ranges under the same vma, but not vice versa. All VAs within
//     the va_range share the same policy state.
//
//     Each va_range is a collection of VA blocks. The VA blocks each have
//     individual locks, and they hold the current mapping and location state
//     for their block across all processors (CPU and all GPUs).
//
// VA ranges with type == UVM_VA_RANGE_TYPE_EXTERNAL:
//     These ranges track physical allocations made by RM. The UVM driver is
//     responsible for mapping them to the GPU(s), but not to the CPU. These
//     ranges do not support faulting or migration, and they do not necessarily
//     correspond to valid vmas.
//
//     These ranges do not have blocks. All state (page tables, mapping handles,
//     etc) is maintained within the range.
//
// VA ranges with type == UVM_VA_RANGE_TYPE_CHANNEL:
//     These are similar to EXTERNAL ranges, except they represent internal
//     allocations required for user channels to operate (context save areas,
//     for example).
//
// VA ranges with type == UVM_VA_RANGE_TYPE_SKED_REFLECTED:
//     These ranges track special SKED reflected mappings required for CNP. The
//     mappings don't have any physical backing. They just use PTEs with a
//     special kind, see make_sked_reflected_pte_pascal() for an example of the
//     PTE encoding.
//     Notably, the API that creates these ranges calls them "dynamic
//     parallelism regions", but we use "SKED reflected ranges" internally as
//     it's more descriptive.
//
// VA ranges with type == UVM_VA_RANGE_TYPE_SEMAPHORE_POOL:
//     These ranges track semaphore pool allocations. They are backed by sysmem,
//     and persistently mapped on the CPU and all GPUs (with registered VA
//     spaces) in a user VA space. The ranges are also mapped in the UVM
//     internal VA space on the CPU and all registered GPUs.
//
//     These ranges do not have blocks.
//

// This enum must be kept in sync with UVM_TEST_VA_RANGE_TYPE in
// uvm_test_ioctl.h
typedef enum
{
    UVM_VA_RANGE_TYPE_INVALID = 0,
    UVM_VA_RANGE_TYPE_MANAGED,
    UVM_VA_RANGE_TYPE_EXTERNAL,
    UVM_VA_RANGE_TYPE_CHANNEL,
    UVM_VA_RANGE_TYPE_SKED_REFLECTED,
    UVM_VA_RANGE_TYPE_SEMAPHORE_POOL,
    UVM_VA_RANGE_TYPE_MAX
} uvm_va_range_type_t;

// Wrapper to protect access to VMA's vm_page_prot
typedef struct
{
    // Needed for creating CPU mappings on the va_range. Do not access this
    // directly, instead use uvm_va_range_vma and friends.
    struct vm_area_struct *vma;

    uvm_rw_semaphore_t lock;
} uvm_vma_wrapper_t;

// TODO: Bug 1733295. VA range types should really be inverted. Instead of
//       maintaining common node state with a union of structs, we should have
//       separate C types for each VA range type. Each type would embed a common
//       VA range node.
//
//       There's a lot of state in the top-level uvm_va_range_t struct below
//       which really belongs in the per-type structs (for example, blocks).
//       We're deferring that cleanup to the full refactor.

// va_range state when va_range.type == UVM_VA_RANGE_TYPE_MANAGED
typedef struct
{
    // This is null in the case of a zombie allocation. Zombie allocations are
    // created from unfreed allocations at termination of a process which used
    // UVM_INIT_FLAGS_MULTI_PROCESS_SHARING_MODE, when at least one other
    // process is sharing the UVM file descriptor.
    uvm_vma_wrapper_t *vma_wrapper;

    // Managed allocations always use this policy; the policy stored in the
    // va_block is only used for HMM allocations.
    uvm_va_policy_t policy;

    uvm_perf_module_data_desc_t perf_modules_data[UVM_PERF_MODULE_TYPE_COUNT];
} uvm_va_range_managed_t;

typedef struct
{
    // GPU mapping the allocation. The GPU's RM address space is required when
    // releasing the handle.
    uvm_gpu_t *gpu;

    // RM handle to the physical allocation. This handle is dup'd into our
    // client once - on initial mapping of the external allocation. If the
    // allocation is ever split, its ref_count is incremented. The allocation
    // is not released until the ref_count drops to 0.
    NvHandle rm_handle;

    // Refcount for this handle/allocation. The refcount is used when external
    // ranges are split, resulting in two ranges using the same physical
    // allocation.
    nv_kref_t ref_count;
} uvm_ext_gpu_mem_handle;

typedef struct
{
    uvm_range_tree_node_t node;

    // Handle to the physical user allocation dup'd into our client. This
    // prevents the allocation from being removed until we free it, even if the
    // user frees their handle without telling us.
    // This will be NULL for sparse mappings, which don't correspond to actual
    // allocations.
    uvm_ext_gpu_mem_handle *mem_handle;

    // Tracks completion of PTE writes on pt_range_vec. The tree lock
    // protecting this ext_gpu_map may be dropped before those writes are
    // complete, so subsequent operations on this ext_gpu_map must acquire this
    // tracker before operating on pt_range_vec.
    uvm_tracker_t tracker;

    // GPU on which this allocation is mapped.
    uvm_gpu_t *gpu;

    // GPU which owns the allocation. For sysmem, this is the GPU that the
    // sysmem was originally allocated under. For the allocation to remain
    // valid we need to prevent the GPU from going away, similarly to P2P
    // mapped memory. The same applies to EGM memory.
    //
    // This field is not used for sparse mappings as they don't have an
    // allocation and, hence, no owning GPU.
    //
    // TODO: Bug 1811006: The semantics of sysmem might change depending on the
    //       resolution of this bug.
    //
    // TODO: Bug 1757136: For SLI, this is any GPU in the SLI group. We may need
    //       to handle that specially.
    uvm_gpu_t *owning_gpu;

    // We need to know whether this memory is actually located on owning_gpu so
    // we know what type of membar is needed at TLB invalidate time, and to know
    // if the mapping GPU has to be unmapped on UvmDisablePeerAccess.
    //
    // This field is not used for sparse mappings as they don't have physical
    // backing.
    bool is_sysmem;

    // EGM memory. If true, is_sysmem must also be true and owning_gpu must be
    // valid.
    bool is_egm;

    // GPU page tables mapping the allocation
    uvm_page_table_range_vec_t pt_range_vec;

    // Node for the deferred free list where this allocation is stored when it
    // is unmapped.
    //
    // This field is unused for sparse mappings. Since they don't have physical
    // backing there is no RM object to be freed when the mapping is unmapped.
    uvm_deferred_free_object_t deferred_free;
} uvm_ext_gpu_map_t;

typedef struct
{
    // Lock protecting the range tree.
    uvm_mutex_t lock;

    // Range tree that contains all of the mapped portions of an External VA
    // range. The tree holds uvm_ext_gpu_map_t instances.
    uvm_range_tree_t tree;
} uvm_ext_gpu_range_tree_t;

typedef struct
{
    // Mask of GPUs which have mappings to this VA range. If a bit in this mask
    // is set, the corresponding pointer in gpu_ranges is valid.
    // The bitmap can be safely accessed by following these locking rules (see
    // the sketch following this struct):
    //   * If the VA space lock is held for write, the mask can be read or
    //     written normally.
    //   * If the VA space lock is held for read, and one of the range tree
    //     locks is held, only the bit corresponding to that GPU range tree can
    //     be accessed. Writes must use uvm_processor_mask_set_atomic and
    //     uvm_processor_mask_clear_atomic to avoid clobbering other bits in
    //     the mask. If no range tree lock is held, the mask cannot be
    //     accessed.
    //   * If the VA space lock is not held, the mask cannot be accessed.
    uvm_processor_mask_t mapped_gpus;

    // Per-GPU tree of mapped external allocations. This has to be per-GPU in
    // the VA range because each GPU is able to map a completely different set
    // of allocations to the same VA range.
    uvm_ext_gpu_range_tree_t gpu_ranges[UVM_ID_MAX_GPUS];

    // Dynamically allocated processor mask, allocated in
    // uvm_va_range_create_external() and used and freed in uvm_free().
    uvm_processor_mask_t *retained_mask;
} uvm_va_range_external_t;
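
// Example (illustrative sketch only, not part of the API): setting a GPU's bit
// in mapped_gpus with the VA space lock held in read mode, following the
// locking rules above. uvm_id_gpu_index() from uvm_processors.h is assumed
// here to be the helper used to index gpu_ranges by GPU id.
//
//     uvm_ext_gpu_range_tree_t *range_tree =
//         &va_range->external.gpu_ranges[uvm_id_gpu_index(gpu->id)];
//
//     uvm_mutex_lock(&range_tree->lock);
//
//     // Only this GPU's bit may be touched, and only atomically, since other
//     // GPUs' bits may be updated concurrently under their own tree locks.
//     uvm_processor_mask_set_atomic(&va_range->external.mapped_gpus, gpu->id);
//
//     uvm_mutex_unlock(&range_tree->lock);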

// va_range state when va_range.type == UVM_VA_RANGE_TYPE_CHANNEL. This
// represents a channel buffer resource and mapping.
typedef struct
{
    // Only a single GPU can map a channel resource, so we only need one GPU
    // VA space parent.
    uvm_gpu_va_space_t *gpu_va_space;

    // Page tables mapped by this range
    uvm_page_table_range_vec_t pt_range_vec;

    // Physical location of this channel resource. All pages have the same
    // aperture.
    uvm_aperture_t aperture;

    // Note that this is not a normal RM object handle. It is a non-zero opaque
    // identifier underneath the GPU VA space which represents this channel
    // resource. Each channel using this VA range has retained this descriptor
    // and is responsible for releasing it. That's safe because channels
    // outlive their VA ranges.
    NvP64 rm_descriptor;

    // This is an ID assigned by RM to each resource descriptor.
    NvU32 rm_id;

    // The TSG which owns this mapping. Sharing of VA ranges is only allowed
    // within the same TSG. If valid == false, no sharing is allowed because
    // the channel is not in a TSG.
    struct
    {
        bool valid;
        NvU32 id;
    } tsg;

    NvU64 ref_count;

    // Storage in the corresponding uvm_gpu_va_space's channel_va_ranges list
    struct list_head list_node;
} uvm_va_range_channel_t;

// va_range state when va_range.type == UVM_VA_RANGE_TYPE_SKED_REFLECTED. This
// represents a sked reflected mapping.
typedef struct
{
    // Each SKED reflected range is unique to a single GPU so only a single GPU
    // VA space needs to be tracked.
    uvm_gpu_va_space_t *gpu_va_space;

    // Page tables mapped by this range
    uvm_page_table_range_vec_t pt_range_vec;
} uvm_va_range_sked_reflected_t;

typedef struct
{
    uvm_mem_t *mem;

    // The optional owner is a GPU (at most one) that has the allocation
    // cached - in this case, all writes must be done from this GPU.
    // Protected by the va_space lock.
    uvm_gpu_t *owner;

    // Per-GPU attributes
    uvm_mem_gpu_mapping_attrs_t gpu_attrs[UVM_ID_MAX_GPUS];

    // Default attributes to assign when a new GPU is registered
    uvm_mem_gpu_mapping_attrs_t default_gpu_attrs;

    // Tracks all outstanding GPU work using this allocation.
    uvm_tracker_t tracker;
    uvm_mutex_t tracker_lock;
} uvm_va_range_semaphore_pool_t;

struct uvm_va_range_struct
{
    // Parent uvm_va_space.
    uvm_va_space_t *va_space;

    // Storage in VA range tree. Also contains range start and end.
    // start and end + 1 have to be PAGE_SIZE-aligned.
    uvm_range_tree_node_t node;

    // Force the next split on this range to fail. Set by error injection ioctl
    // (testing purposes only).
    bool inject_split_error;

    // Force the next register_gpu_va_space to fail while adding this va_range.
    // Set by error injection ioctl (testing purposes only).
    bool inject_add_gpu_va_space_error;

    // Mask of UVM-Lite GPUs for the VA range
    //
    // If the preferred location is set to a non-faultable GPU or the CPU,
    // this mask contains all non-faultable GPUs that are in the accessed by
    // mask and the preferred location itself if it's a GPU. Empty otherwise.
    //
    // All UVM-Lite GPUs have mappings only to the preferred location. The
    // mappings are initially established only when the pages are resident on
    // the preferred location, but persist after that until the preferred
    // location is changed or a GPU stops being a UVM-Lite GPU.
    uvm_processor_mask_t uvm_lite_gpus;

    // This is a uvm_va_block_t ** array of all VA block pointers under this
    // range. The pointers can be accessed using the functions
    // uvm_va_range_block() and uvm_va_range_block_create(). The latter
    // allocates the block if it doesn't already exist. Once allocated, the
    // blocks persist in the array until the parent VA range is destroyed.
    //
    // Concurrent on-demand allocation requires the use of either atomics or a
    // spin lock. Given that we don't want to take a spin lock for every
    // lookup, and that the blocks are persistent, atomics are preferred.
    //
    // The number of blocks is calculated from the range size using
    // uvm_va_range_num_blocks().
    //
    // TODO: Bug 1766585: Compare perf of up-front allocation and demand-
    //       allocation of blocks in the common case (lots of accessed blocks)
    //       and the sparse case. If the common case is hurt by demand-
    //       allocation, or if the sparse case isn't helped much, just allocate
    //       them all at range allocation.
    atomic_long_t *blocks;

    // Type of this VA range. The union member matching the type holds the
    // per-type state.
    uvm_va_range_type_t type;
    union
    {
        uvm_va_range_managed_t managed;
        uvm_va_range_external_t external;
        uvm_va_range_channel_t channel;
        uvm_va_range_sked_reflected_t sked_reflected;
        uvm_va_range_semaphore_pool_t semaphore_pool;
    };
};
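
// Example (illustrative only): per-type state must always be accessed through
// the union member matching va_range->type. A hypothetical dispatch might look
// like:
//
//     switch (va_range->type) {
//         case UVM_VA_RANGE_TYPE_MANAGED:
//             policy = &va_range->managed.policy;
//             break;
//         case UVM_VA_RANGE_TYPE_EXTERNAL:
//             gpu_ranges = va_range->external.gpu_ranges;
//             break;
//         default:
//             // CHANNEL, SKED_REFLECTED and SEMAPHORE_POOL are handled
//             // analogously through their union members.
//             break;
//     }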

// Module load/exit
NV_STATUS uvm_va_range_init(void);
void uvm_va_range_exit(void);

static NvU64 uvm_va_range_size(uvm_va_range_t *va_range)
{
    return uvm_range_tree_node_size(&va_range->node);
}

static bool uvm_va_range_is_aligned(uvm_va_range_t *va_range, NvU64 alignment)
{
    return IS_ALIGNED(va_range->node.start, alignment) && IS_ALIGNED(uvm_va_range_size(va_range), alignment);
}

static bool uvm_va_range_is_managed_zombie(uvm_va_range_t *va_range)
{
    return va_range->type == UVM_VA_RANGE_TYPE_MANAGED && va_range->managed.vma_wrapper == NULL;
}

// Create a va_range with type UVM_VA_RANGE_TYPE_MANAGED. The out va_range
// pointer is optional.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the vma overlaps with an existing range
// in the va_space tree.
NV_STATUS uvm_va_range_create_mmap(uvm_va_space_t *va_space,
                                   struct mm_struct *mm,
                                   uvm_vma_wrapper_t *vma_wrapper,
                                   uvm_va_range_t **out_va_range);

// Create a va_range with type UVM_VA_RANGE_TYPE_EXTERNAL. The out va_range
// pointer is optional.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
// range in the va_space tree.
NV_STATUS uvm_va_range_create_external(uvm_va_space_t *va_space,
                                       struct mm_struct *mm,
                                       NvU64 start,
                                       NvU64 length,
                                       uvm_va_range_t **out_va_range);

// Create a va_range with type UVM_VA_RANGE_TYPE_CHANNEL. The out va_range
// pointer is optional.
//
// Returns NV_ERR_UVM_ADDRESS_IN_USE if the range overlaps with an existing
// range in the va_space tree.
NV_STATUS uvm_va_range_create_channel(uvm_va_space_t *va_space,
                                      struct mm_struct *mm,
                                      NvU64 start,
                                      NvU64 end,
                                      uvm_va_range_t **out_va_range);

NV_STATUS uvm_va_range_create_sked_reflected(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             uvm_va_range_t **out_va_range);

NV_STATUS uvm_va_range_create_semaphore_pool(uvm_va_space_t *va_space,
                                             struct mm_struct *mm,
                                             NvU64 start,
                                             NvU64 length,
                                             const UvmGpuMappingAttributes *per_gpu_attrs,
                                             NvU32 per_gpu_attrs_count,
                                             uvm_va_range_t **out_va_range);

// Destroys any state associated with this VA range, removes the VA range from
// the VA space, and frees the VA range.
//
// deferred_free_list may be NULL if the VA range type is known to not require
// deferred free. Otherwise this function adds entries to the list for later
// processing by uvm_deferred_free_object_list.
void uvm_va_range_destroy(uvm_va_range_t *va_range, struct list_head *deferred_free_list);

void uvm_va_range_zombify(uvm_va_range_t *va_range);

NV_STATUS uvm_api_clean_up_zombie_resources(UVM_CLEAN_UP_ZOMBIE_RESOURCES_PARAMS *params, struct file *filp);
NV_STATUS uvm_api_validate_va_range(UVM_VALIDATE_VA_RANGE_PARAMS *params, struct file *filp);

// Inform the VA range that a GPU VA space is now available for it to map, if
// the VA range is supposed to proactively map GPUs (UvmAllocSemaphorePool,
// UvmSetAccessedBy).
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_add_gpu_va_space(uvm_va_range_t *va_range,
                                        uvm_gpu_va_space_t *gpu_va_space,
                                        struct mm_struct *mm);

// Destroy the VA range's mappings on the GPU, if it has any
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
void uvm_va_range_remove_gpu_va_space(uvm_va_range_t *va_range,
                                      uvm_gpu_va_space_t *gpu_va_space,
                                      struct mm_struct *mm,
                                      struct list_head *deferred_free_list);

// Inform the VA range that peer mappings can now be established between the
// GPUs, if the VA range is supposed to proactively create them
// (UvmSetAccessedBy).
NV_STATUS uvm_va_range_enable_peer(uvm_va_range_t *va_range, uvm_gpu_t *gpu0, uvm_gpu_t *gpu1);

// Unmap all page tables in this VA range which have peer mappings between
// these two GPUs, in either direction.
void uvm_va_range_disable_peer(uvm_va_range_t *va_range,
                               uvm_gpu_t *gpu0,
                               uvm_gpu_t *gpu1,
                               struct list_head *deferred_free_list);

// Notify the VA range of a newly registered GPU.
//
// LOCKING: the lock of the enclosing VA space is held in R/W mode
NV_STATUS uvm_va_range_register_gpu(uvm_va_range_t *va_range, uvm_gpu_t *gpu);

// Unmap all page tables in this VA range which map memory owned by this GPU.
// Managed ranges will have any memory still resident on this GPU evicted to
// system memory.
//
// deferred_free_list may be NULL if the VA range type is known to not require
// deferred free. Otherwise this function adds entries to the list for later
// processing by uvm_deferred_free_object_list.
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
void uvm_va_range_unregister_gpu(uvm_va_range_t *va_range,
                                 uvm_gpu_t *gpu,
                                 struct mm_struct *mm,
                                 struct list_head *deferred_free_list);

// Splits existing_va_range into two pieces, with new_va_range always after
// existing. existing is updated to have new_end. new_end + 1 must be
// page-aligned.
//
// Before: [----------- existing ------------]
// After:  [---- existing ----][---- new ----]
//                            ^new_end
//
// On error, existing_va_range is still accessible and is left in its original
// functional state.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
NV_STATUS uvm_va_range_split(uvm_va_range_t *existing_va_range,
                             NvU64 new_end,
                             uvm_va_range_t **new_va_range);
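
// Example (illustrative only): splitting a managed range covering
// [0x7f0000000000, 0x7f00002fffff] at new_end == 0x7f00000fffff leaves
// existing covering [0x7f0000000000, 0x7f00000fffff] and creates a new range
// covering [0x7f0000100000, 0x7f00002fffff]:
//
//     uvm_va_range_t *new_range;
//     NV_STATUS status = uvm_va_range_split(existing, 0x7f00000fffffULL, &new_range);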

// TODO: Bug 1707562: Merge va ranges

static uvm_va_range_t *uvm_va_range_container(uvm_range_tree_node_t *node)
{
    if (!node)
        return NULL;
    return container_of(node, uvm_va_range_t, node);
}

// Returns the va_range containing addr, if any
uvm_va_range_t *uvm_va_range_find(uvm_va_space_t *va_space, NvU64 addr);

static uvm_ext_gpu_map_t *uvm_ext_gpu_map_container(uvm_range_tree_node_t *node)
{
    if (!node)
        return NULL;
    return container_of(node, uvm_ext_gpu_map_t, node);
}

// Iterators for all va_ranges

#define uvm_for_each_va_range(va_range, va_space) \
    list_for_each_entry((va_range), &(va_space)->va_range_tree.head, node.list)

#define uvm_for_each_va_range_safe(va_range, va_range_next, va_space) \
    list_for_each_entry_safe((va_range), (va_range_next), &(va_space)->va_range_tree.head, node.list)

// Iterators for specific ranges

// Returns the first va_range in the range [start, end], if any
uvm_va_range_t *uvm_va_space_iter_first(uvm_va_space_t *va_space, NvU64 start, NvU64 end);

// Returns the va_range following the provided va_range in address order, if
// that va_range's start <= the provided end.
uvm_va_range_t *uvm_va_space_iter_next(uvm_va_range_t *va_range, NvU64 end);

// Like uvm_va_space_iter_next, but also returns NULL if the next va_range
// is not adjacent to the provided va_range.
static uvm_va_range_t *uvm_va_space_iter_next_contig(uvm_va_range_t *va_range, NvU64 end)
{
    uvm_va_range_t *next = uvm_va_space_iter_next(va_range, end);
    if (next && next->node.start != va_range->node.end + 1)
        return NULL;
    return next;
}

// Returns true if the range [start, end] contains no VA ranges
static bool uvm_va_space_range_empty(uvm_va_space_t *va_space, NvU64 start, NvU64 end)
{
    return uvm_va_space_iter_first(va_space, start, end) == NULL;
}

#define uvm_for_each_va_range_in(va_range, va_space, start, end)            \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end));  \
         (va_range);                                                        \
         (va_range) = uvm_va_space_iter_next((va_range), (end)))

#define uvm_for_each_va_range_in_safe(va_range, va_range_next, va_space, start, end)    \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (end)),              \
             (va_range_next) = uvm_va_space_iter_next((va_range), (end));               \
         (va_range);                                                                    \
         (va_range) = (va_range_next), (va_range_next) = uvm_va_space_iter_next((va_range), (end)))
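
// Example (illustrative sketch only): walking all VA ranges overlapping
// [start, end] while holding the VA space lock in read mode (assuming the
// uvm_va_space_down_read()/uvm_va_space_up_read() helpers declared in
// uvm_va_space.h):
//
//     uvm_va_space_down_read(va_space);
//
//     uvm_for_each_va_range_in(va_range, va_space, start, end) {
//         if (va_range->type != UVM_VA_RANGE_TYPE_MANAGED)
//             continue;
//         // Read-only inspection of the range is safe here. Modifying range
//         // state (policy, permissions) would require the lock in write mode.
//     }
//
//     uvm_va_space_up_read(va_space);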

// Iterator for all contiguous VA ranges between [start, end]. If any part of
// [start, end] is not covered by a VA range, iteration stops.
#define uvm_for_each_va_range_in_contig(va_range, va_space, start, end)         \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
         (va_range);                                                            \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))

#define uvm_for_each_va_range_in_contig_from(va_range, va_space, first_va_range, end) \
    for ((va_range) = (first_va_range);                                               \
         (va_range);                                                                  \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))

// Like uvm_for_each_va_range_in_contig but also stops iteration if any VA range
// has a type other than UVM_VA_RANGE_TYPE_MANAGED.
#define uvm_for_each_managed_va_range_in_contig(va_range, va_space, start, end) \
    for ((va_range) = uvm_va_space_iter_first((va_space), (start), (start));    \
         (va_range) && (va_range)->type == UVM_VA_RANGE_TYPE_MANAGED;           \
         (va_range) = uvm_va_space_iter_next_contig((va_range), (end)))

#define uvm_for_each_va_range_in_vma(va_range, vma)             \
    uvm_for_each_va_range_in(va_range,                          \
                             uvm_va_space_get(vma->vm_file),    \
                             vma->vm_start,                     \
                             vma->vm_end - 1)

#define uvm_for_each_va_range_in_vma_safe(va_range, va_range_next, vma) \
    uvm_for_each_va_range_in_safe(va_range,                             \
                                  va_range_next,                        \
                                  uvm_va_space_get(vma->vm_file),       \
                                  vma->vm_start,                        \
                                  vma->vm_end - 1)

// Only call this if you're sure that either:
// 1) You have a reference on the vma's vm_mm and that vma->vm_mm's mmap_lock is
//    held; or
// 2) You won't be operating on the vma (as with vm_insert_page) or accessing
//    any fields in the vma that can change without va_space->lock being held
//    (such as vm_flags).
//
// Otherwise, use uvm_va_range_vma_current or uvm_va_range_vma_check and be
// prepared to handle a NULL return value.
static struct vm_area_struct *uvm_va_range_vma(uvm_va_range_t *va_range)
{
    struct vm_area_struct *vma;
    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d", va_range->type);
    UVM_ASSERT(va_range->managed.vma_wrapper);

    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    // vm_file, vm_private_data, vm_start, and vm_end are all safe to access
    // here because they can't change without the kernel calling vm_ops->open
    // or vm_ops->close, which both take va_space->lock.
    vma = va_range->managed.vma_wrapper->vma;
    UVM_ASSERT(vma);
    UVM_ASSERT_MSG(vma->vm_private_data == va_range->managed.vma_wrapper,
                   "vma: 0x%llx [0x%lx, 0x%lx] has vm_private_data 0x%llx\n",
                   (NvU64)vma,
                   vma->vm_start,
                   vma->vm_end - 1,
                   (NvU64)vma->vm_private_data);
    UVM_ASSERT_MSG(va_range->va_space == uvm_va_space_get(vma->vm_file),
                   "va_range va_space: 0x%llx vm_file: 0x%llx vm_file va_space: 0x%llx",
                   (NvU64)va_range->va_space,
                   (NvU64)vma->vm_file,
                   (NvU64)uvm_va_space_get(vma->vm_file));
    UVM_ASSERT_MSG(va_range->node.start >= vma->vm_start,
                   "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
                   va_range->node.start,
                   va_range->node.end,
                   vma->vm_start,
                   vma->vm_end - 1);
    UVM_ASSERT_MSG(va_range->node.end <= vma->vm_end - 1,
                   "Range mismatch: va_range: [0x%llx, 0x%llx] vma: [0x%lx, 0x%lx]\n",
                   va_range->node.start,
                   va_range->node.end,
                   vma->vm_start,
                   vma->vm_end - 1);

    return vma;
}

// Check that the VA range's vma is safe to use under mm. If not, NULL is
// returned. If the vma is returned, there must be a reference on mm and
// mm->mmap_lock must be held.
static struct vm_area_struct *uvm_va_range_vma_check(uvm_va_range_t *va_range, struct mm_struct *mm)
{
    struct vm_area_struct *vma;

    UVM_ASSERT_MSG(va_range->type == UVM_VA_RANGE_TYPE_MANAGED, "type: %d\n", va_range->type);

    // Zombies don't have a vma_wrapper.
    if (!va_range->managed.vma_wrapper)
        return NULL;

    vma = uvm_va_range_vma(va_range);

    // Examples of mm on various paths:
    //  - CPU fault         vma->vm_mm
    //  - GPU fault         current->mm or va_space->va_space_mm.mm
    //  - IOCTL             current->mm or va_space->va_space_mm.mm
    //  - Process teardown  NULL
    //
    // Since the "safe" mm varies based on the path, we may not have a reference
    // on the vma's owning mm_struct. We won't know that until we look at the
    // vma. By then it's too late to take mmap_lock since mmap_lock is above the
    // va_space lock in our lock ordering, and we must be holding the va_space
    // lock to query the va_range. Hence the need to detect the cases in which
    // it's safe to operate on the vma.
    //
    // When we can't detect for certain that mm is safe to use, we shouldn't
    // operate on the vma at all. The vma can't be outright freed until we drop
    // the va_space lock so the pointer itself will remain valid, but its fields
    // (like vm_start and vm_end) could be modified behind our back. We also
    // aren't allowed to call vm_insert_page unless we hold the vma's mmap_lock.
    //
    // Note that if uvm_va_space_mm_enabled() is true, then vma->vm_mm must be
    // va_space->va_space_mm.mm because we enforce that at mmap.
    //
    // An interesting case is when vma->vm_mm != current->mm. This can happen
    // due to fork, ptrace, process teardown, etc. It will also be the case in
    // the GPU fault handler.
    if (mm != vma->vm_mm)
        return NULL;

    uvm_assert_mmap_lock_locked(vma->vm_mm);
    return vma;
}

// Helper for use when the only mm which is known is current->mm
static struct vm_area_struct *uvm_va_range_vma_current(uvm_va_range_t *va_range)
{
    return uvm_va_range_vma_check(va_range, current->mm);
}
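
// Example (illustrative sketch only): only touch the vma when the current mm
// is known to be safe for it, as described above. uvm_cpu_map_page() is a
// hypothetical caller-side helper, not part of this header.
//
//     struct vm_area_struct *vma = uvm_va_range_vma_current(va_range);
//
//     if (vma) {
//         // Safe: vma->vm_mm == current->mm and the caller holds its
//         // mmap_lock, so operations like vm_insert_page are allowed.
//         status = uvm_cpu_map_page(vma, addr, page);
//     }
//     else {
//         // Not safe to operate on the vma from this path. Skip the CPU
//         // mapping here; a later path which holds the right mmap_lock
//         // (a CPU fault, for example) can establish it.
//     }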

// Returns the maximum number of VA blocks which could be contained within the
// given va_range (number of elements in the va_range->blocks array).
// va_range->node.start and .end must be set.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
size_t uvm_va_range_num_blocks(uvm_va_range_t *va_range);

// Get the index within the va_range->blocks array of the VA block
// corresponding to addr. The block pointer is not guaranteed to be valid. Use
// either uvm_va_range_block or uvm_va_range_block_create to look up the block.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
size_t uvm_va_range_block_index(uvm_va_range_t *va_range, NvU64 addr);

// Looks up the VA block at va_range->blocks[index]. If no block is present at
// that index, NULL is returned.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
static uvm_va_block_t *uvm_va_range_block(uvm_va_range_t *va_range, size_t index)
{
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    UVM_ASSERT(index < uvm_va_range_num_blocks(va_range));
    uvm_assert_rwsem_locked(&va_range->va_space->lock);

    return (uvm_va_block_t *)atomic_long_read(&va_range->blocks[index]);
}

// Same as uvm_va_range_block except that the block is created if not already
// present in the array. If NV_OK is returned, the block has been allocated
// successfully.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
NV_STATUS uvm_va_range_block_create(uvm_va_range_t *va_range, size_t index, uvm_va_block_t **out_block);

// Returns the first populated VA block in the VA range after the input
// va_block, or NULL if none. If the input va_block is NULL, this returns the
// first VA block in the VA range, if any exists.
uvm_va_block_t *uvm_va_range_block_next(uvm_va_range_t *va_range, uvm_va_block_t *va_block);

// Iterate over populated VA blocks in the range. Does not create new VA blocks.
#define for_each_va_block_in_va_range(__va_range, __va_block)           \
    for (__va_block = uvm_va_range_block_next(__va_range, NULL);        \
         __va_block;                                                    \
         __va_block = uvm_va_range_block_next(__va_range, __va_block))

// Safe version of for_each_va_block_in_va_range. Does not create new VA
// blocks.
#define for_each_va_block_in_va_range_safe(__va_range, __va_block, __va_block_next)            \
    for (__va_block = uvm_va_range_block_next(__va_range, NULL),                               \
         __va_block_next = uvm_va_range_block_next(__va_range, __va_block);                    \
         __va_block;                                                                           \
         __va_block = __va_block_next,                                                         \
         __va_block_next = __va_block? uvm_va_range_block_next(__va_range, __va_block) : NULL)
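
// Example (illustrative sketch only): find or create the block covering a
// managed address, then walk every populated block in the range, with
// va_space->lock held as required by uvm_va_range_block().
//
//     size_t index = uvm_va_range_block_index(va_range, addr);
//     uvm_va_block_t *va_block;
//
//     status = uvm_va_range_block_create(va_range, index, &va_block);
//     if (status != NV_OK)
//         return status;
//
//     for_each_va_block_in_va_range(va_range, va_block) {
//         // va_block visits each populated block in address order
//     }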

// Set the VA range preferred location (or unset it if preferred location is
// UVM_ID_INVALID).
//
// Unsetting the preferred location potentially changes the range group
// association to UVM_RANGE_GROUP_ID_NONE if the VA range was previously
// associated with a non-migratable range group.
//
// Changing the preferred location also updates the mask and mappings of GPUs
// in UVM-Lite mode.
//
// The va_range must have type UVM_VA_RANGE_TYPE_MANAGED.
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// If out_tracker != NULL any block work will be added to that tracker.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_set_preferred_location(uvm_va_range_t *va_range,
                                              uvm_processor_id_t preferred_location,
                                              int preferred_cpu_nid,
                                              struct mm_struct *mm,
                                              uvm_tracker_t *out_tracker);

// Add a processor to the accessed_by mask and establish any new required
// mappings.
//
// Also update the mask of UVM-Lite GPUs if needed.
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// If out_tracker != NULL any block work will be added to that tracker.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_set_accessed_by(uvm_va_range_t *va_range,
                                       uvm_processor_id_t processor_id,
                                       struct mm_struct *mm,
                                       uvm_tracker_t *out_tracker);

// Remove a processor from the accessed_by mask
//
// If out_tracker != NULL any block work will be added to that tracker.
//
// This also updates the mask and mappings of the UVM-Lite GPUs if required.
void uvm_va_range_unset_accessed_by(uvm_va_range_t *va_range,
                                    uvm_processor_id_t processor_id,
                                    uvm_tracker_t *out_tracker);

// Set read-duplication and remove any existing accessed_by and remote mappings
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_set_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);

// Unset read-duplication and establish accessed_by mappings
//
// If mm != NULL, that mm is used for any CPU mappings which may be created as
// a result of this call. See uvm_va_block_context_t::mm for details.
//
// LOCKING: If mm != NULL, the caller must hold mm->mmap_lock in at least read
//          mode.
NV_STATUS uvm_va_range_unset_read_duplication(uvm_va_range_t *va_range, struct mm_struct *mm);

// Create and destroy vma wrappers
uvm_vma_wrapper_t *uvm_vma_wrapper_alloc(struct vm_area_struct *vma);
void uvm_vma_wrapper_destroy(uvm_vma_wrapper_t *vma_wrapper);

static uvm_va_policy_t *uvm_va_range_get_policy(uvm_va_range_t *va_range)
{
    UVM_ASSERT(va_range->type == UVM_VA_RANGE_TYPE_MANAGED);
    return &va_range->managed.policy;
}

NV_STATUS uvm_test_va_range_info(UVM_TEST_VA_RANGE_INFO_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_va_range_split(UVM_TEST_VA_RANGE_SPLIT_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_va_range_inject_split_error(UVM_TEST_VA_RANGE_INJECT_SPLIT_ERROR_PARAMS *params, struct file *filp);
NV_STATUS uvm_test_va_range_inject_add_gpu_va_space_error(UVM_TEST_VA_RANGE_INJECT_ADD_GPU_VA_SPACE_ERROR_PARAMS *params,
                                                          struct file *filp);

#endif // __UVM_VA_RANGE_H__